1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include <cmath>
10 
11 #include "HWConformity.h"
12 #include "Optimizer.h"
13 #include "visa_wa.h"
14 #include "DebugInfo.h"
15 #include "G4_Verifier.hpp"
16 
17 using namespace vISA;
18 
getReverseCondMod(G4_CondModifier mod)19 static G4_CondModifier getReverseCondMod(G4_CondModifier mod)
20 {
21     switch (mod)
22     {
23     case Mod_z:
24         return Mod_z;
25     case Mod_e:
26         return Mod_e;
27     case Mod_nz:
28         return Mod_nz;
29     case Mod_ne:
30         return Mod_ne;
31     case Mod_g:
32         return Mod_l;
33     case Mod_ge:
34         return Mod_le;
35     case Mod_l:
36         return Mod_g;
37     case Mod_le:
38         return Mod_ge;
39     default:
40         MUST_BE_TRUE(0, "Invalid conditional modifier input for reversed conditional modifier.");
41     }
42     return Mod_cond_undef;
43 }
44 
isCompressedInst(G4_INST * inst)45 static bool isCompressedInst(G4_INST* inst) {
46     return inst->isComprInst();
47 }
48 
// Evaluates to true when "opnd" is
//   - an immediate, or
//   - a source region whose region width equals "execSize", or
//   - a source region whose vertical stride is 0.
// Both macro parameters and each &&-group are parenthesized so that expression
// arguments (e.g. "p + 1" or "mask & 8") expand with the intended precedence.
// Note that "opnd" is evaluated more than once.
#define isUnitRegionRow(opnd, execSize)      \
        ((opnd)->isImm() ||      \
        ((opnd)->isSrcRegRegion() && (opnd)->asSrcRegRegion()->getRegion()->width == (execSize)) || \
        ((opnd)->isSrcRegRegion() && (opnd)->asSrcRegRegion()->getRegion()->vertStride == 0))
53 
getDclAlignment(int opndBytes,G4_INST * inst,bool isScalar)54 G4_SubReg_Align HWConformity::getDclAlignment(int opndBytes, G4_INST* inst, bool isScalar)
55 {
56     auto subAlign = Get_G4_SubRegAlign_From_Size((uint16_t)opndBytes);
57     bool hasAccSrc = inst->hasACCSrc();
58 
59     if (hasAccSrc && subAlign < GRFALIGN)
60     {
61         subAlign = GRFALIGN;
62     }
63 
64     if (!isScalar)
65     {
66         // certain instructions have additional alignment requirements for non-scalar sources
67         if (!builder.hasAlign1Ternary() && inst->getNumSrc() == 3 && !inst->isSend() && subAlign < Eight_Word)
68         {
69             subAlign = Eight_Word;
70         }
71         if (inst->isMath())
72         {
73             subAlign = GRFALIGN;
74         }
75     }
76 
77     return subAlign;
78 }
79 /*
80  *  create a new mov instruction and insert it after "it"
81  *  mov (esize) dst tmp:type
82  *  where esize is "inst"'s execution size and insert it after "inst"
83  *  return value is the new temp variable as a dst operand
84  *  If dstAlign is specified, the new temp will at least be aligend to that size
85  *
86  *  The new mov instruction is inserted right after "it", and caller is safe to
87  *  access it via "++it".
88  */
insertMovAfter(INST_LIST_ITER & it,G4_DstRegRegion * dst,G4_Type type,G4_BB * bb,G4_SubReg_Align dstAlign)89 G4_DstRegRegion* HWConformity::insertMovAfter(INST_LIST_ITER& it, G4_DstRegRegion* dst, G4_Type type, G4_BB* bb, G4_SubReg_Align dstAlign)
90 {
91     G4_INST* inst = *it;
92 
93     if (!dst)
94     {
95         return dst;
96     }
97 
98     if (inst->hasNULLDst())
99     {
100         return builder.createDst(
101             dst->getBase(),
102             0,
103             0,
104             1,
105             type);
106     }
107 
108     G4_ExecSize exec_size = inst->getExecSize();
109     G4_Type execType = inst->isRawMov() ? dst->getType() : inst->getExecType();
110     bool scalarSrc = true;
111 
112     for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++)
113     {
114         G4_Operand* src = inst->getSrc(i);
115         if (!src->isImm())
116         {
117             if (!(inst->isMath() && i == 1 && src->isNullReg()) &&
118                 (src->isSrcRegRegion() && !src->asSrcRegRegion()->isScalar()))
119             {
120                 scalarSrc = false;
121             }
122         }
123         else if (IS_VINTTYPE(src->getType()) || IS_VFTYPE(src->getType()))
124         {
125             scalarSrc = false;
126         }
127     }
128 
129     G4_ExecSize newExecSize =
130         ((inst->opcode() == G4_sel || inst->getImplAccSrc() || !scalarSrc) ? exec_size : g4::SIMD1);
131 
132     uint32_t opExecWidthBytes = newExecSize * TypeSize(execType);
133     if (execType == Type_DF && IS_BTYPE(type))
134     {
135         type = (type == Type_UB ? Type_UW : Type_W);
136     }
137     uint16_t dstWidthBytes = newExecSize * TypeSize(type);
138     uint16_t scale = TypeSize(execType) / TypeSize(type);
139     /*   so according to comments in function that call it MAD needs to have packed format.
140         It ends up with hStride 2, due to DefHoisting.
141         So it is trying to undo it.
142         For every other type if srcType > dstCype we need to adjust regions.
143         This is not necessary for HF. It's already packed.
144 
145         The src region of move is wrong. Since for HF it is packed, unlike other data types.
146         mad (8) r56.0.xyzw:hf -r37.0.xyzw:f r59.0.xyzw:hf r58.0.xyzw:hf {Align16, NoMask}
147         mov (16) r44.0<2>:hf r56.0<16;8,2>:hf {Align1, H1} // #??:$39:%66
148     */
149     if (scale == 0 || (builder.getPlatform() >= GENX_CHV && execType == Type_F && type == builder.getMixModeType()))
150     {
151         scale = 1;
152     }
153 
154     G4_SubReg_Align subAlign = getDclAlignment(opExecWidthBytes > dstWidthBytes ? opExecWidthBytes : dstWidthBytes,
155         inst, newExecSize == 1);
156 
157     if (subAlign < dstAlign)
158     {
159         subAlign = dstAlign;
160     }
161 
162     const RegionDesc* region = newExecSize > 1 ? builder.createRegionDesc(scale, 1, 0) : builder.getRegionScalar();
163 
164     G4_Declare* dcl = builder.createTempVar(newExecSize == 1 ? 1 : newExecSize * scale, type, subAlign);
165 
166     G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(dcl, region);
167     G4_Predicate* pred = NULL;
168 
169     if (inst->opcode() != G4_sel)
170     {
171         pred = inst->getPredicate();
172         inst->setPredicate(NULL);
173         // maintainDU4TempMov will update def-use
174     }
175 
176     unsigned int new_option = inst->getMaskOption();
177     G4_INST* newInst = builder.createMov(exec_size, dst, srcRegion, new_option, false);
178     newInst->setPredicate(pred);
179     newInst->setSaturate(inst->getSaturate());
180     bb->insertAfter(it, newInst);
181 
182     // update propagation info
183     maintainDU4TempMov(inst, newInst);
184 
185     if (type == dst->getType())
186     {
187         newInst->setSaturate(g4::NOSAT);
188     }
189     else if (type == Type_F || type == Type_DF)
190     {
191         inst->setSaturate(g4::NOSAT);
192     }
193 
194     inst->setExecSize(newExecSize);
195     if (newExecSize == 1)
196     {
197         inst->setNoMask(true);
198     }
199 
200     return builder.createDstRegRegion(dcl, scale);
201 }
202 
203 //
204 // replace instruction (*it)' source srcPos, which must be a scalar/immediate,
205 // with a temp variable after inserting
206 // mov (esize) tmp<1>:type imm/scalar {options}
207 // before the instruction
208 // This is like insertMovBefore(), except that the latter will always use
209 // simd1 move for scalar/imm values, which may not be what we want
210 // NOTE: This does not check for redundant moves.  We are counting on a later LVN pass
211 // to clean them up
212 //
broadcast(G4_BB * bb,INST_LIST_ITER it,int srcPos,G4_SubReg_Align align)213 void HWConformity::broadcast(
214     G4_BB* bb, INST_LIST_ITER it, int srcPos, G4_SubReg_Align align)
215 {
216     G4_INST* inst = *it;
217     G4_Operand* src = inst->getSrc(srcPos);
218     MUST_BE_TRUE(src->isImm() ||
219         (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar()),
220         "source must be an immediate or scalar");
221     G4_Type type = src->getType();
222 
223     G4_ExecSize execSize = inst->getExecSize();
224     uint32_t instMask = inst->getMaskOption();
225 
226     // avoid simd16 Qword moves
227     MUST_BE_TRUE((unsigned)execSize * TypeSize(type) <= 2u * numEltPerGRF<Type_UB>(),
228         "move can't exceed 2 GRFs");
229 
230     G4_Declare* dcl = builder.createTempVar(execSize, type, align);
231     G4_DstRegRegion* dst = builder.createDst(
232         dcl->getRegVar(),
233         0,
234         0,
235         1,
236         type);
237     G4_INST* newInst = builder.createMov(execSize, dst, src, instMask, false);
238 
239     bb->insertBefore(it, newInst);
240 
241     const RegionDesc* srcRegion = builder.getRegionStride1();
242     G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(dcl, srcRegion);
243     inst->setSrc(newSrc, srcPos);
244     newInst->addDefUse(inst, inst->getSrcOperandNum(srcPos));
245 
246 }
247 
248 //
249 // A simplified version of insertMovBefore(), this copies raw bytes from source to a temp
250 // and replaces the original source with tmp.  This is primarily used to ensure operand alignment and region restrictions
251 // op (esize) ... (mod) src<region>:type
252 // -->
253 // mov (esize) tmp<1>:type src<region>:type
254 // op (esize) ... (mod) tmp<1;1,0>:type
255 //
256 // source must be a G4_SrcRegRegion (direct or indirect), immediates are not supported
257 // note that modifier is propagated from source to tmp, but region is not
258 //
259 //
insertCopyBefore(INST_LIST_ITER it,uint32_t srcNum,G4_SubReg_Align tmpAlign,G4_BB * bb)260 G4_SrcRegRegion* HWConformity::insertCopyBefore(INST_LIST_ITER it, uint32_t srcNum,
261     G4_SubReg_Align tmpAlign, G4_BB* bb)
262 {
263     G4_INST* inst = *it;
264     G4_Operand* src = inst->getSrc(srcNum);
265     MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
266     G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
267 
268     G4_ExecSize newExecSize = origSrc->isScalar() ? g4::SIMD1 : inst->getExecSize();
269     G4_Declare* dcl = builder.createTempVar(newExecSize, origSrc->getType(), tmpAlign);
270     G4_SrcModifier modifier = origSrc->getModifier();
271     origSrc->setModifier(Mod_src_undef);
272     G4_DstRegRegion* dst = builder.createDstRegRegion(dcl, 1);
273 
274     G4_INST* movInst = builder.createMov(newExecSize, dst, origSrc, InstOpt_WriteEnable, false);
275 
276     bb->insertBefore(it, movInst);
277     G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(modifier, Direct, dcl->getRegVar(),
278         0, 0, newExecSize == 1 ? builder.getRegionScalar() : builder.getRegionStride1(),
279         dcl->getElemType());
280 
281     return newSrc;
282 }
283 
insertCopyAtBBEntry(G4_BB * bb,G4_ExecSize execSize,G4_Operand * src)284 G4_SrcRegRegion* HWConformity::insertCopyAtBBEntry(G4_BB* bb, G4_ExecSize execSize, G4_Operand* src)
285 {
286     MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
287     G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
288     auto lb = src->getLinearizedStart();
289     auto rb = src->getLinearizedEnd();
290 
291     unsigned int regNum = lb / numEltPerGRF<Type_UB>();
292     unsigned int numRegs = (rb + numEltPerGRF<Type_UB>() - 1 - lb) / numEltPerGRF<Type_UB>();
293     if (regNum == -1 || numRegs == 0)
294     {
295         return nullptr;
296     }
297 
298     G4_Declare* dcl = builder.createTempVar(execSize, origSrc->getType(), GRFALIGN);
299     dcl->getRegVar()->setPhyReg(builder.phyregpool.getGreg(regNum), 0);
300     G4_SrcModifier modifier = origSrc->getModifier();
301     origSrc->setModifier(Mod_src_undef);
302     G4_DstRegRegion* dst = builder.createDstRegRegion(dcl, 1);
303     dst->computePReg();
304 
305     G4_INST* movInst = builder.createMov(execSize, dst, origSrc, InstOpt_WriteEnable, false);
306 
307     for (auto it = bb->begin();
308         it != bb->end();
309         it++)
310     {
311         if (!(*it)->isLabel())
312         {
313             bb->insertBefore(it, movInst);
314             break;
315         }
316     }
317 
318     G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(modifier, Direct, dcl->getRegVar(),
319         0, 0, execSize == 1 ? builder.getRegionScalar() : builder.getRegionStride1(),
320         dcl->getElemType());
321     newSrc->asSrcRegRegion()->computePReg();
322     return newSrc;
323 }
324 
325 /*
326  *  create a new mov instruction
327  *  mov (esize) tmp<1>:type src
328  *  where esize is "inst"'s execution size and insert it before "inst"
329  *  return value is the new temp variable as a source operand.
330  *
331  *  "inst" is pointed by "it", and the new mov inst is inserted right
332  *  before "it", so that caller can safely use "--it" to access the new
333  *  mov instruction.
334  */
insertMovBefore(INST_LIST_ITER it,uint32_t srcNum,G4_Type type,G4_BB * bb,G4_SubReg_Align tmpAlign)335 G4_Operand* HWConformity::insertMovBefore(
336     INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb,
337     G4_SubReg_Align tmpAlign)
338 {
339     return insertMovBefore(it, srcNum, type, bb, 0, tmpAlign);
340 }
341 
insertMovBefore(INST_LIST_ITER it,uint32_t srcNum,G4_Type type,G4_BB * bb,uint16_t tmpStride,G4_SubReg_Align tmpAlign)342 G4_Operand* HWConformity::insertMovBefore(INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb,
343     uint16_t tmpStride, G4_SubReg_Align tmpAlign)
344 {
345     G4_INST* inst = *it;
346     G4_SubReg_Align subAlign;
347     const RegionDesc* region = nullptr;
348     G4_ExecSize execSize = inst->getExecSize();
349     G4_Operand* src = inst->getSrc(srcNum);
350     unsigned short scale = IS_BTYPE(src->getType()) && src->getType() == type ? 2 : 1;
351 
352     G4_ExecSize newExecSize = (src->isImm() && !IS_VTYPE(src->getType())) ||
353         (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar())
354         ? g4::SIMD1 : execSize;
355 
356     if (newExecSize > 1)
357     {
358         if (tmpStride)
359         {
360             scale = tmpStride;
361         }
362         else
363         {
364             if (scale == 1 && !IS_VTYPE(src->getType()))
365             {
366                 scale = (uint16_t)(TypeSize(src->getType()) / TypeSize(type));
367             }
368             if (scale == 0)
369             {
370                 scale = 1;
371             }
372         }
373         region = builder.createRegionDesc(scale, 1, 0);
374     }
375     else
376     {
377         scale = src->getTypeSize() / TypeSize(type);
378         if (scale == 0)
379         {
380             scale = 1;
381         }
382         region = builder.getRegionScalar();
383     }
384 
385     int opExecWidthBytes = IS_VINTTYPE(src->getType()) ?
386         numEltPerGRF<Type_UB>() / 2 * (execSize > 8 ? execSize / 8 : 1) :
387         (src->getType() == Type_VF ?
388             numEltPerGRF<Type_UB>() / 2 * (execSize > 4 ? execSize / 4 : 1) :
389             newExecSize * TypeSize(type) * scale);
390 
391     subAlign = getDclAlignment(opExecWidthBytes, inst, newExecSize == 1);
392 
393     if (subAlign < tmpAlign)
394     {
395         subAlign = tmpAlign;
396     }
397 
398     uint32_t newInstEMask = newExecSize == 1 ? InstOpt_WriteEnable : inst->getMaskOption();
399 
400     // due to old BDW regioning rule we need NoMask inst here so they can be split
401     if (kernel.getKernelType() == VISA_CM && builder.getPlatform() == GENX_BDW)
402     {
403         if (!bb->isAllLaneActive())
404         {
405             newInstEMask = InstOpt_WriteEnable;
406         }
407     }
408 
409     G4_Declare* dcl = builder.createTempVar(newExecSize == 1 ? 1 : newExecSize * scale, type, subAlign);
410     G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, scale);
411     G4_INST* newInst = builder.createMov(newExecSize, dstRegion, builder.duplicateOperand(src), newInstEMask, false);
412     bb->insertBefore(it, newInst);
413     inst->transferDef(newInst, Gen4_Operand_Number(srcNum + 1), Opnd_src0);
414     newInst->addDefUse(inst, Gen4_Operand_Number(srcNum + 1));
415 
416     G4_SrcModifier modifier = Mod_src_undef;
417     if (src->isSrcRegRegion())
418     {
419         G4_SrcModifier srcMod = src->asSrcRegRegion()->getModifier();
420         if (srcMod == Mod_Not)
421         {
422             // mov doesn't support logic modifiers, so we keep it on the new source
423             modifier = Mod_Not;
424             newInst->getSrc(0)->asSrcRegRegion()->setModifier(Mod_src_undef);
425         }
426         else if (src->getType() == Type_BF)
427         {
428             // bf mov does not support src mod as it is changed to shl or uw mov.
429             // Keep it on the new source.
430             modifier = srcMod;
431             newInst->getSrc(0)->asSrcRegRegion()->setModifier(Mod_src_undef);
432         }
433     }
434 
435     return builder.createSrcRegRegion(
436         modifier,
437         Direct,
438         dcl->getRegVar(),
439         0,
440         0,
441         region,
442         dcl->getElemType());
443 }
444 
fixPackedSource(INST_LIST_ITER it,G4_BB * bb)445 void HWConformity::fixPackedSource(INST_LIST_ITER it, G4_BB* bb)
446 {
447     G4_INST* inst = *it;
448 
449     bool nonTypeWFound = false, nonTypeFFound = false, incompatibleTypeFound = false;
450 
451     for (int i = 0; i < inst->getNumSrc(); i++)
452     {
453         auto src = inst->getSrc(i);
454         if (!src)
455         {
456             continue;
457         }
458         if (!IS_VTYPE(src->getType()))
459         {
460             // Make sure other src operands are of word type only as this is a HW requirement
461             if (src->getType() != Type_W && src->getType() != Type_UW)
462             {
463                 nonTypeWFound = true;
464             }
465             if (src->getType() != Type_F)
466             {
467                 nonTypeFFound = true;
468             }
469             continue;
470         }
471         G4_Type target_type = src->getType() == Type_VF ? Type_F : Type_W;
472         if (target_type == Type_W && (nonTypeWFound || !builder.hasByteALU()))
473         {
474             // non-word type src is not allowed to co-exist with :v src
475             // also if platform lacks byte regioning :v src may be incompatible with later legalization
476             incompatibleTypeFound = true;
477         }
478         else if (target_type == Type_F && nonTypeFFound == true)
479         {
480             // non-float type src is not allowed to co-exist with :vf src
481             incompatibleTypeFound = true;
482         }
483 
484         // Insert a move only if immediate operand is not last src operand
485         if (i != inst->getNumSrc() - 1 || incompatibleTypeFound == true)
486         {
487             inst->setSrc(insertMovBefore(it, i, target_type, bb), i);
488         }
489     }
490 }
491 /*
492  * fixMathInst() checks the following:
493  * The math instruction can only use GRF registers as source(s) and destination.
494  * The math instruction does not support indirect addressing modes.
495  * source horizontal stride must be 1 with the exception of scalar sources and destination horizontal stride must be always 1.
496  * Source and destination offset must be the same, except the case of scalar source
497  * DW and UD is the only source format supported for INT DIV, FP16/FP32 is the only source format supported for all the other functions.
498  * Mixed DW and UD sources are not allowed for the INT DIV function.
499  * For single source math function, <src1> must be programmed as ARF-NULL register.
500  */
fixMathInst(INST_LIST_ITER it,G4_BB * bb)501 bool HWConformity::fixMathInst(INST_LIST_ITER it, G4_BB* bb)
502 {
503     G4_INST* inst = *it;
504     G4_DstRegRegion* dst = inst->getDst();
505     G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);
506     bool mov_dst = false;
507 
508     MUST_BE_TRUE(inst->isMath(), "Expect math instruction");
509     G4_InstMath* mathInst = inst->asMathInst();
510 
511     if (mathInst->getMathCtrl() == MATH_INVM || mathInst->getMathCtrl() == MATH_RSQRTM)
512     {
513         // split two GRF math macros. This should only happen for FP64
514         if (!builder.hasTwoGRFMathMacro() &&
515             IS_DFTYPE(inst->getDst()->getType()) && ((uint32_t)(inst->getExecSize() * 2)) > builder.getNativeExecSize())
516         {
517             evenlySplitInst(it, bb);
518             return true;
519         }
520         // math macros are constructed internally and should already conform to all other HW rules
521         return false;
522     }
523 
524     if (builder.getOption(vISA_DisableHFMath))
525     {
526         auto src0 = inst->getSrc(0);
527         auto src1 = inst->getSrc(1);
528         auto dst = inst->getDst();
529         if (src0 && src0->getType() == Type_HF)
530         {
531             replaceSrc(it, 0, Type_F, bb);
532         }
533 
534         if (src1 && src1->getType() == Type_HF)
535         {
536             replaceSrc(it, 1, Type_F, bb);
537         }
538 
539         if (dst && dst->getType() == Type_HF)
540         {
541             replaceDst(it, Type_F);
542         }
543     }
544 
545     // covers MATH_INT_DIV, MATH_INT_DIV_QUOT, MATH_INT_DIV_REM
546     bool isIntDivide = inst->asMathInst()->isMathIntDiv();
547     bool hasSameOffset = hasSameSubregOffset(inst);
548 
549     auto hasModMinus = [](G4_Operand* SrcOprd)
550     {
551         if (SrcOprd->isSrcRegRegion())
552         {
553             G4_SrcModifier mod = SrcOprd->asSrcRegRegion()->getModifier();
554             return (mod == Mod_Minus || mod == Mod_Minus_Abs);
555         }
556         return false;
557     };
558 
559     // check if the source needs a move and if so the new move type
560     auto needsMove = [this, inst, isIntDivide, hasSameOffset, hasModMinus](int srcID, G4_Type& newType)
561     {
562         assert((srcID == 0 || srcID == 1) && "math can have at most two sources");
563         G4_Operand* src = inst->getSrc(srcID);
564         newType = src->getType();
565         if (isIntDivide)
566         {
567             // case 1: Perform a signed division if there's any minus src modifier.
568             //   math.quot  r10:w   r20:ub   -r30:ub
569             // Make sure newType is D, not UD. The correct code is:
570             //   mov  r22:d  r20:ub
571             //   mov  r32:d  -r30:ub
572             //   math.quot r10:w  r22:d  r32:d
573             // case 2: Perform an appropriate type conversion based on the type ranks of both sources.
574             //   math.quot  r6:ud  r3:b  r4:ud
575             // Make sure it's still an unsigned division.
576             //   mov  r11:ud  r3:b
577             //   math.quot  r6:ud  r11:ud  r4:ud
578             G4_Type src0Type = inst->getSrc(0)->getType();
579             G4_Type src1Type = inst->getSrc(1)->getType();
580             G4_Type divType = Type_UNDEF;
581             if (hasModMinus(inst->getSrc(0)) || hasModMinus(inst->getSrc(1)))
582             {
583                 // If there's any minus source modifier, do a signed division.
584                 divType = Type_D;
585             }
586             else if (TypeSize(src0Type) != TypeSize(src1Type))
587             {
588                 // If src0 and src1 have different ranks, get the signedness of the
589                 // division from the higher rank src.
590                 G4_Type higherRankType = TypeSize(src0Type) > TypeSize(src1Type) ? src0Type : src1Type;
591                 divType = IS_SIGNED_INT(higherRankType) ? Type_D : Type_UD;
592             }
593             else
594             {
595                 // If both sources have the same rank, do a signed division only
596                 // when both are signed. Otherwise, do an unsigned division.
597                 divType = IS_SIGNED_INT(src0Type) && IS_SIGNED_INT(src1Type) ? Type_D : Type_UD;
598             }
599             assert(divType == Type_D || divType == Type_UD);
600             if (newType != divType)
601             {
602                 newType = divType;
603                 return true;
604             }
605         }
606         else if ((src->getType() != Type_F && src->getType() != Type_VF) &&
607             (builder.getPlatform() == GENX_BDW || src->getType() != Type_HF))
608         {
609             // CHV+ supports F/HF math, while BDW only supports F math
610             // mix mode math is handled in fixMixedHFInst()
611             newType = Type_F;
612             return true;
613         }
614 
615         if (src->isImm())
616         {
617             if (srcID == 0 && inst->asMathInst()->getMathCtrl() >= MATH_FDIV)
618             {
619                 return true;
620             }
621         }
622         else if (src->isSrcRegRegion())
623         {
624             G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
625             const RegionDesc* rd = srcRegion->getRegion();
626             if (srcRegion->getModifier() != Mod_src_undef && isIntDivide)
627             {
628                 // no source modifer for int divide
629                 return true;
630             }
631             else if (srcRegion->getRegAccess() != Direct)
632             {
633                 return true;
634             }
635             else if (!srcRegion->isScalar())
636             {
637                 if (!hasSameOffset && !builder.isOpndAligned(srcRegion, numEltPerGRF<Type_UB>()))
638                 {
639                     return true;
640                 }
641                 else if (!rd->isContiguous(inst->getExecSize()))
642                 {
643                     return true;
644                 }
645             }
646         }
647         else
648         {
649             ASSERT_USER(false, "Unexpected math source!");
650         }
651         return false;
652     };
653 
654     if (src0)
655     {
656         G4_Type src0_type = src0->getType();
657         bool needsSrc0Mov = needsMove(0, src0_type);
658         if (needsSrc0Mov)
659         {
660             inst->setSrc(insertMovBefore(it, 0, src0->isImm() ? G4_Operand::GetNonVectorImmType(src0_type) : src0_type, bb), 0);
661             src0 = inst->getSrc(0);
662         }
663     }
664 
665     bool nullSrc1 = src1 && src1->isNullReg();
666     if (!nullSrc1 && src1)
667     {
668         G4_Type src1_type = src1->getType();
669         bool needsSrc1Move = needsMove(1, src1_type);
670 
671         if (needsSrc1Move)
672         {
673             if (isIntDivide && src1->isImm() && !IS_VINTTYPE(src1->getType()))
674             {
675                 // just change the immediate's type
676                 uint32_t immVal = (uint32_t)src1->asImm()->getImm();
677                 inst->setSrc(builder.createImm(immVal, src1_type), 1);
678             }
679             else
680             {
681                 inst->setSrc(insertMovBefore(it, 1, src1->isImm() ? G4_Operand::GetNonVectorImmType(src1_type) : src1_type, bb), 1);
682             }
683             src1 = inst->getSrc(1);
684         }
685     }
686 
687     if (nullSrc1 && src0 && src1->getType() != src0->getType())
688     {
689         G4_SrcRegRegion* src1_opnd = builder.createNullSrc(inst->getSrc(0)->getType());
690         inst->setSrc(src1_opnd, 1);
691     }
692 
693     // recompute as src0 and src1 may have been modified
694     hasSameOffset = hasSameSubregOffset(inst);
695     G4_Type extype = inst->getExecType2();
696     bool cond1 = (dst->getType() != extype && !(dst->getType() == Type_UD && extype == Type_D));
697     if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 || cond1 ||
698         (!hasSameOffset && inst->getExecSize() != g4::SIMD1 && !builder.isOpndAligned(dst, numEltPerGRF<Type_UB>())))
699     {
700         mov_dst = true;
701         replaceDst(it, extype);
702     }
703 
704     if (builder.hasHFMathGRFAlign())
705     {
706         auto src0 = inst->getSrc(0);
707         auto src1 = inst->getSrc(1);
708         auto dst = inst->getDst();
709 
710         if (dst && !dst->isNullReg() && dst->getType() == Type_HF && dst->getHorzStride() == 1)
711         {
712             if (!builder.isOpndAligned(dst, numEltPerGRF<Type_UB>()))
713             {
714                 mov_dst = true;
715                 replaceDst(it, dst->getType(), GRFALIGN);
716             }
717             if (src0 && !src0->isNullReg() && src0->getType() == Type_HF)
718             {
719                 if (!builder.isOpndAligned(src0, numEltPerGRF<Type_UB>()))
720                 {
721                     G4_Operand* newSrc0 = insertMovBefore(it, 0, src0->getType(), bb, GRFALIGN);
722                     inst->setSrc(newSrc0, 0);
723                 }
724             }
725 
726             if (src1 && !src1->isNullReg() && src1->getType() == Type_HF)
727             {
728                 if (!builder.isOpndAligned(src0, numEltPerGRF<Type_UB>()))
729                 {
730                     G4_Operand* newSrc0 = insertMovBefore(it, 1, src0->getType(), bb, GRFALIGN);
731                     inst->setSrc(newSrc0, 1);
732                 }
733             }
734         }
735     }
736 
737     return mov_dst;
738 }
739 
hasSameSubregOffset(G4_INST * inst) const740 bool HWConformity::hasSameSubregOffset(G4_INST* inst) const
741 {
742     uint32_t offset;
743     return hasSameSubregOffset(inst, offset);
744 }
745 
746 //
747 // returns true if all sources and dst in this inst have the same fixed subreg offset
748 // null src/dst, scalar sources and immediates are excluded from the check
749 // If true, return the common byte offset in byteOffset
750 //
hasSameSubregOffset(G4_INST * inst,uint32_t & byteOffset) const751 bool HWConformity::hasSameSubregOffset(G4_INST* inst, uint32_t& byteOffset) const
752 {
753     bool anyOffset = true; // true means offset is not fixed yet
754     byteOffset = 0;
755     if (inst->getDst())
756     {
757         G4_DstRegRegion* dst = inst->getDst();
758         if (dst->isNullReg())
759         {
760             // do nothing
761         }
762         else if (dst->hasFixedSubregOffset(byteOffset))
763         {
764             anyOffset = false;
765         }
766         else
767         {
768             return false;
769         }
770     }
771 
772     for (int i = 0; i < inst->getNumSrc(); ++i)
773     {
774         G4_Operand* src = inst->getSrc(i);
775         if (src->isSrcRegRegion())
776         {
777             uint32_t srcOffset = 0;
778             G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
779             if (srcRegion->isNullReg() || srcRegion->isScalar())
780             {
781                 continue;
782             }
783             else if (srcRegion->hasFixedSubregOffset(srcOffset))
784             {
785                 if (anyOffset)
786                 {
787                     byteOffset = srcOffset;
788                     anyOffset = false;
789                 }
790                 else if (srcOffset != byteOffset)
791                 {
792                     return false;
793                 }
794             }
795             else
796             {
797                 return false;
798             }
799         }
800     }
801 
802     return true;
803 }
804 
// Check the following rules
// -- src0 in two-source instructions may not be an immediate.  In such cases we try to
//    swap src0 and src1 for commutative instructions
// -- ARF may not be in src1
// Legalizes immediate, address-expression and ARF sources of the instruction at `it`.
//   it -- iterator to the instruction to fix; any helper mov is inserted before it
//   bb -- basic block that contains `it`
// The instruction is modified in place (sources swapped, cond modifier / predicate
// inverted for sel, or sources replaced by temporaries written by inserted movs).
// Instructions for which mayExceedTwoGRF() is true are left untouched.
void HWConformity::fixImmAndARFSrc(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* inst = *it;
    if (inst->mayExceedTwoGRF())
    {
        return;
    }

    G4_Operand* src0, * src1, * src2;
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    /* Check for usage of two constants in binary operations */
    // Only two-source instructions whose src0 is an immediate or an address expression are handled here.
    if (src0 && (src0->isImm() || src0->isAddrExp()) && inst->getNumSrc() == 2)
    {
        if (INST_COMMUTATIVE(inst->opcode()) && !src1->isImm())
        {
            //all commutative inst must have 2 sources
            if (inst->opcode() == G4_mul)
            {
                bool needConstMov;
                //for DW and W mul, src0 must be DW and src1 W
                // so a DW constant in src0 with a non-DW src1 cannot simply be swapped into src1
                needConstMov = IS_DTYPE(src0->getType()) && !IS_DTYPE(src1->getType());

                if (needConstMov)
                {
                    // materialize the constant in a temporary and keep it in src0
                    G4_Type tmpType = G4_Operand::GetNonVectorImmType(src0->getType());

                    G4_Operand* newSrc0 = insertMovBefore(it, 0, tmpType, bb);
                    inst->setSrc(newSrc0, 0);
                }
                else
                {
                    // swap operands
                    inst->swapSrc(0, 1);
                    inst->swapDefUse();
                }
            }
            else
            {
                // swap operands
                inst->swapSrc(0, 1);
                inst->swapDefUse();
            }
        }
        /*
        * A select operation isn't commutative, but we may commute the
        * operands provided we perform a predicate inversion as well.
        * (v0)  sel ... const V1
        *    =>
        * (-v0) sel ... V1 const
        */
        else if (inst->opcode() == G4_sel && !src1->isImm())
        {
            G4_CondMod* cond = inst->getCondMod();
            if (cond)
            {
                // Only e <-> ne are inverted; every other modifier is left unchanged.
                // Note the new cond modifier is created with sub-register offset 0.
                switch (cond->getMod())
                {
                case Mod_ne:
                    inst->setCondMod(builder.createCondMod(Mod_e, cond->getBase(), 0));
                    break;
                case Mod_e:
                    inst->setCondMod(builder.createCondMod(Mod_ne, cond->getBase(), 0));
                    break;
                default:
                    break;
                }
            }
            else
            {
                // no cond modifier: the sel must be predicated, so flip the predicate state
                G4_Predicate* pred = inst->getPredicate();
                MUST_BE_TRUE(pred != NULL, "predicate must not be null");
                G4_PredState reverse = pred->getState() == PredState_Minus ? PredState_Plus : PredState_Minus;
                inst->setPredicate(builder.createPredicate(
                    reverse, pred->getBase(), pred->getSubRegOff(), pred->getControl()));
            }
            inst->swapSrc(0, 1);
            inst->swapDefUse();
        }
        else if (!inst->isMath())
        {
            // math immediate src0 is handled separately in fixMathInst()
            // If src0 is not 64-bit, src1 is 64-bit, swap them to save one move.
            if (INST_COMMUTATIVE(inst->opcode()) && src0->isImm() && src1->isImm() &&
                src0->getTypeSize() != 8 && src1->getTypeSize() == 8)
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
                // refresh the cached pointers so the code below sees the swapped operands
                src0 = inst->getSrc(0);
                src1 = inst->getSrc(1);
                // this needs to fall through as we still need move for src0
            }

            if (INST_COMMUTATIVE(inst->opcode()) && src0->isAddrExp() && src1->isImm())
            {
                // The original IR has both addr expr and immediate
                //   add(8) A0(0, 0)<1>:uw &V36 + 0 0xeca86420 : uv{ Align1, Q1 }
                // We insert a move for src1 which is an immediate
                //   mov(8) TV0(0, 0)<1> : uw 0xeca86420 : uv{ Align1 }
                //   add(8) A0(0, 0)<1> : uw &V36 + 0 TV0(0, 0)<8; 8, 1> : uw{ Align1, Q1 }
                G4_Type type = src1->getType();
                inst->setSrc(insertMovBefore(it, 1, G4_Operand::GetNonVectorImmType(type), bb), 1);
                // And we swap addr expr and the new variable
                //   add(8) A0(0, 0)<1> : uw TV0(0, 0)<8; 8, 1> : uw &V36 + 0 {Align1, Q1}
                // The final code sequence is
                //   mov(8) r13.0<1>:uw 0xeca86420 : uv{ Align1 } // #26:$9:%79
                //   add(8) a0.0<1> : uw r13.0<8; 8, 1> : uw 0x60 : uw{ Align1, Q1 }
                // (src0 still refers to the address expression captured before the mov was inserted)
                inst->setSrc(inst->getSrc(1), 0);
                inst->setSrc(src0, 1);
                inst->swapDefUse();
            }
            else
            {
                // No swap is possible: copy src0 into a temporary.  Instructions that need
                // a DW type get D/UD (by signedness); otherwise the non-vector form of the
                // immediate's own type is used.
                G4_Type newSrcType = inst->needsDWType() ? (IS_UNSIGNED_INT(src0->getType()) ? Type_UD : Type_D) :
                    G4_Operand::GetNonVectorImmType(src0->getType());
                inst->setSrc(insertMovBefore(it, 0, newSrcType, bb), 0);
            }
        }
    }

    // Re-read the sources: the code above may have swapped or replaced them.
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    // madw-specific handling of immediate sources

    if (inst->opcode() == G4_madw)
    {
        // src0 can not be immediate.
        if (src0 && src0->isImm())
        {
            // swap src0 and src1 if src0 is immediate but src1 is not immediate
            if (src1 && !src1->isImm())
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
                src0 = inst->getSrc(0);
                src1 = inst->getSrc(1);
            }
            else
            {
                // both are immediates (or src1 is absent): move src0 into a D/UD temporary
                inst->setSrc(insertMovBefore(it, 0, IS_UNSIGNED_INT(src0->getType()) ? Type_UD : Type_D, bb), 0);
                src0 = inst->getSrc(0);
            }
        }

        // fix the immediate type of G4_madw as it can only support D/UD types
        // (the immediate value is truncated to 32 bits and re-created as D or UD by signedness)
        if (src1 && src1->isImm())
        {
            uint32_t immVal = (uint32_t)src1->asImm()->getImm();
            inst->setSrc(builder.createImm(immVal, IS_SIGNED_INT(src1->getType()) ? Type_D : Type_UD), 1);
            src1 = inst->getSrc(1);
        }

        if (src2 && src2->isImm())
        {
            uint32_t immVal = (uint32_t)src2->asImm()->getImm();
            inst->setSrc(builder.createImm(immVal, IS_SIGNED_INT(src2->getType()) ? Type_D : Type_UD), 2);
            src2 = inst->getSrc(2);
        }
    }

    // madw can have src1 as immediate
    // any other three-source instruction gets its immediate src1 moved to a temporary
    if (inst->getNumSrc() == 3 && src1->isImm() && inst->opcode() != G4_madw)
    {
        inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
    }

    // Architecture registers may not appear as src1.
    // NOTE: src1 is not re-read after the mov above; it still points at the operand seen before it.
    auto isARF = [](G4_Operand* opnd) { return opnd->isAreg() || opnd->isFlag(); };
    if (src1 != nullptr && isARF(src1) && !src1->isNullReg())
    {
        /* See if we can swap the src1 */
        if (INST_COMMUTATIVE(inst->opcode()) && !isARF(src0))
        {
            inst->swapSrc(0, 1);
            inst->swapDefUse();
        }
        else
        {
            /* Otherwise introduce a tmp */
            inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
        }
    }

    src2 = inst->getSrc(2);

    // 3 src instructions except madw can't have any constants
    // (only enforced when the platform lacks Align1 ternary support)
    if (!builder.hasAlign1Ternary() && src2 != nullptr && src2->isImm() && inst->opcode() != G4_madw)
    {
        inst->setSrc(insertMovBefore(it, 2, src2->getType(), bb), 2);
    }
}
1004 
fixLine(INST_LIST_ITER it,G4_BB * bb)1005 bool HWConformity::fixLine(INST_LIST_ITER it, G4_BB* bb)
1006 {
1007     G4_INST* inst = *it;
1008 
1009     if (inst->opcode() == G4_line)
1010     {
1011         bool badRegion = false;
1012         G4_Operand* src0 = inst->getSrc(0);
1013         // assumption: there are 4 elements in src0
1014         if (src0->isSrcRegRegion())
1015         {
1016             const RegionDesc* rd = src0->asSrcRegRegion()->getRegion();
1017             badRegion = (rd->vertStride != 0 || rd->width != 4 || rd->horzStride != 1);
1018         }
1019         if (!IS_FTYPE(src0->getType()) || src0->isImm() || badRegion ||
1020             !builder.isOpndAligned(src0, numEltPerGRF<Type_UB>() / 2))
1021         {
1022             // insertMovBefore()  is not used here
1023             // due to the special region <0;4,1> of src0 of line
1024             G4_Declare* src0_dcl;
1025             G4_DstRegRegion* new_dst_opnd;
1026             G4_SrcRegRegion* new_src0_opnd;
1027             unsigned char mov_size = 4;
1028 
1029             src0_dcl = builder.createTempVar(mov_size, Type_F, Eight_Word);
1030             /* Create temporary variable */
1031             // Actully we set region to be <0;4,1> directly here.
1032             const RegionDesc* rd = builder.createRegionDesc(0, 4, 1);
1033             new_src0_opnd = builder.createSrcRegRegion(src0_dcl, rd);
1034             new_dst_opnd = builder.createDstRegRegion(src0_dcl, 1);
1035 
1036             G4_INST* newInst = builder.createMov(G4_ExecSize(mov_size), new_dst_opnd, src0, InstOpt_NoOpt, false);
1037             newInst->setNoMask(true);
1038 
1039             bb->insertBefore(it, newInst);
1040             inst->setSrc(new_src0_opnd, 0);
1041             return true;
1042         }
1043     }
1044     return false;
1045 }
1046 
fixOpndType(INST_LIST_ITER it,G4_BB * bb)1047 bool HWConformity::fixOpndType(INST_LIST_ITER it, G4_BB* bb)
1048 {
1049     /*
1050     * Check for instruction that only accept float/int operands, as well as
1051     * instruction with mixed operand types.  Even though vISA itself forbids
1052     * mixed type instructions, optimizations such as copy propagation
1053     * may reintroduce them and so we do the checks here
1054     */
1055     G4_INST* inst = *it;
1056     bool changed = false;
1057     int numSrc = inst->getNumSrc();
1058     bool has_float = false;
1059     bool has_int = false;
1060 
1061     if (inst->mayExceedTwoGRF() || inst->opcode() == G4_smov)
1062     {
1063         // skip special instructions
1064         return false;
1065     }
1066 
1067     for (int i = 0; i < numSrc; i++)
1068     {
1069         if (!inst->getSrc(i))
1070         {
1071             continue;
1072         }
1073         G4_Type ty = inst->getSrc(i)->getType();
1074         if (IS_TYPE_FLOAT_ALL(ty))
1075         {
1076             has_float = true;
1077         }
1078         else
1079         {
1080             has_int = true;
1081         }
1082     }
1083     if (has_float && has_int)
1084     {
1085         for (int i = 0; i < numSrc; i++)
1086         {
1087             if (inst->getSrc(i) && !IS_FTYPE(inst->getSrc(i)->getType()) && !IS_DFTYPE(inst->getSrc(i)->getType()))
1088             {
1089                 // FIXME: we should probably either get rid of this or assert,
1090                 // it's unlikely that blinding casting int to float is the right thing here
1091                 inst->setSrc(insertMovBefore(it, i, Type_F, bb), i);
1092                 changed = true;
1093             }
1094         }
1095     }
1096 
1097     if (builder.noSrc1Byte())
1098     {
1099         if (numSrc > 1)
1100         {
1101             G4_Operand* src0 = inst->getSrc(0);
1102             G4_Operand* src1 = inst->getSrc(1);
1103             if (src0 != nullptr && src1 != nullptr && IS_BTYPE(src1->getType()))
1104             {
1105                 if (!IS_BTYPE(src0->getType()) && inst->canSwapSource())
1106                 {
1107                     inst->swapSrc(0, 1);
1108                 }
1109                 else
1110                 {
1111                     bool hasModMinus = false;
1112                     if (src1->isSrcRegRegion())
1113                     {
1114                         G4_SrcModifier mod = src1->asSrcRegRegion()->getModifier();
1115                         hasModMinus = (mod == Mod_Minus || mod == Mod_Minus_Abs);
1116                     }
1117                     // If minus modifier is present, need signed type.
1118                     G4_Type Ty = (IS_SIGNED_INT(src1->getType()) || hasModMinus) ? Type_W : Type_UW;
1119                     inst->setSrc(insertMovBefore(it, 1, Ty, bb), 1);
1120                     changed = true;
1121                 }
1122             }
1123         }
1124     }
1125     if (inst->opcode() == G4_bfn)
1126     {
1127         // BFN requires its operands to be UD/UW
1128         // ToDo: anyway to generalize this to all instructions requiring signed/unsigned int type? IGA doesn't seem to have API to query supported types
1129         auto dst = inst->getDst();
1130         if (dst->getType() == Type_D || dst->getType() == Type_W)
1131         {
1132             dst->setType(dst->getType() == Type_D ? Type_UD : Type_UW);
1133         }
1134         auto changeSrcToUnsigned = [](G4_Operand* opnd)
1135         {
1136             if (opnd->isSrcRegRegion() && (opnd->getType() == Type_D || opnd->getType() == Type_W))
1137             {
1138                 opnd->asSrcRegRegion()->setType(opnd->getType() == Type_D ? Type_UD : Type_UW);
1139             }
1140         };
1141         changeSrcToUnsigned(inst->getSrc(0));
1142         changeSrcToUnsigned(inst->getSrc(1));
1143         changeSrcToUnsigned(inst->getSrc(2));
1144     }
1145     return changed;
1146 }
1147 
/*
 * fixOpnds() checks operand conformity:
 * 1. checks whether an operand may be a constant.
 * 2. checks whether each operand's type conforms to the operation.
 * 3. checks that only src0 uses VxH.
 * 4. checks whether an indirect scalar is used in a compressed inst.
 * It tries to fix these cases by changing the operand order if possible,
 * or by inserting a temporary location with the appropriate conversion.
 */
// Legalizes the source operands of the instruction at `it`.
//   it     -- iterator to the instruction to fix; helper movs are inserted before it
//   bb     -- basic block that contains `it`
//   exType -- type used for the temporaries that replace VxH src1/src2 regions
// Send instructions are skipped.  The steps run in this order:
//   1. mul-specific source ordering (DW operand in src0, VxH region in src0)
//   2. fixImmAndARFSrc()
//   3. VxH regions are removed from src2 and src1
//   4. on ICLLP and later, a VxH src0 is removed when a cond modifier is present
void HWConformity::fixOpnds(INST_LIST_ITER it, G4_BB* bb, G4_Type& exType)
{
    G4_INST* inst = *it;
    if (inst->isSend())
    {
        return;
    }

    G4_Operand* src0, * src1, * src2;

    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    if (inst->opcode() == G4_mul)
    {
        // src1 is DW while src0 is neither DW nor F: exchange them so the DW operand ends up in src0
        if (IS_DTYPE(src1->getType()) &&
            !(IS_DTYPE(src0->getType()) || IS_FTYPE(src0->getType())))
        {
            // check if src0 uses VxH
            bool src0_use_VxH = false;

            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() != Direct &&
                src0->asSrcRegRegion()->getRegion()->isRegionWH()) // is this safe?
            {
                src0_use_VxH = true;
            }
            if (src0_use_VxH)
            {
                // a VxH region may not move into src1, so copy it to a temporary first
                src0 = insertMovBefore(it, 0, src0->getType(), bb);
            }
            // manual swap: src0 may now be the temporary created above
            inst->setSrc(src0, 1);
            inst->setSrc(src1, 0);
            inst->swapDefUse();
            src0 = inst->getSrc(0);
            src1 = inst->getSrc(1);
        }

        // src1 is an indirect VxH region
        if (src1->isSrcRegRegion() && src1->asSrcRegRegion()->getRegAccess() != Direct &&
            src1->asSrcRegRegion()->getRegion()->isRegionWH())
        {
            if (IS_DTYPE(src0->getType()) &&
                !(IS_DTYPE(src1->getType()) || IS_FTYPE(src1->getType())))
            {
                // swapping would break the DW-in-src0 ordering, so copy src1 to a temporary instead
                inst->setSrc(insertMovBefore(it, 1, src1->getType(), bb), 1);
            }
            else
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
            }
            src0 = inst->getSrc(0);
            src1 = inst->getSrc(1);
        }
    }

    fixImmAndARFSrc(it, bb);

    // fixImmAndARFSrc() may have swapped or replaced sources; re-read them
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    // Vx1 and VxH can only be used for src0
    bool src0_use_VxH = false, src1_use_VxH = false;

    // src2 can never be swapped into src0 here, so a VxH src2 is always copied to a temporary
    if (src2 &&
        src2->isSrcRegRegion() &&
        src2->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        inst->setSrc(insertMovBefore(it, 2, exType, bb), 2);
    }

    if (src0 != NULL &&
        src0->isSrcRegRegion() &&
        src0->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        src0_use_VxH = true;
    }

    if (src1 != NULL &&
        src1->isSrcRegRegion() &&
        src1->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        src1_use_VxH = true;
    }

    if (src1_use_VxH)
    {
        // Prefer swapping src1 into src0.  This is allowed for commutative instructions
        // and cmp, provided src0 is not itself VxH and the swap would not move a DW
        // src0 of a mul into src1.
        if ((INST_COMMUTATIVE(inst->opcode()) || inst->opcode() == G4_cmp)
            && !src0_use_VxH &&
            !(inst->opcode() == G4_mul && IS_DTYPE(src0->getType())))
        {
            inst->swapSrc(0, 1);
            if (inst->opcode() == G4_cmp)
            {
                // change condMod
                // (swapping the operands of a compare requires the operand-reversed condition)
                G4_CondMod* condMod = inst->getCondMod();
                if (condMod)
                {
                    G4_CondMod* newCondModOpnd = builder.createCondMod(
                        getReverseCondMod(condMod->getMod()), condMod->getBase(), condMod->getSubRegOff());
                    inst->setCondMod(newCondModOpnd);
                }
            }
        }
        else
        {
            // cannot swap: copy src1 to a temporary of the execution type
            inst->setSrc(insertMovBefore(it, 1, exType, bb), 1);
        }
    }

    // at this point only src0 may be VxH
    // VxH regioning and conditional modifiers may not co-exist
    if (builder.getPlatform() >= GENX_ICLLP)
    {
        src0 = inst->getSrc(0);
        if (src0 && src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegion()->isRegionWH())
        {
            if (inst->getCondMod())
            {
                inst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
            }
        }
    }
}
1282 
fixAlign13SrcInst(INST_LIST_ITER iter,G4_BB * bb)1283 void HWConformity::fixAlign13SrcInst(INST_LIST_ITER iter, G4_BB* bb)
1284 {
1285     // again mad should already conform by construction
1286     G4_INST* inst = *iter;
1287     MUST_BE_TRUE(inst->getNumSrc() == 3 && !inst->isSend(), "expect 3src inst");
1288 
1289     if (inst->opcode() != G4_mad && inst->opcode() != G4_madw)
1290     {
1291         G4_DstRegRegion* dst = inst->getDst();
1292         if (!isGoodAlign1TernaryDst(inst))
1293         {
1294             auto alignment = builder.noSrc2Regioning() ? GRFALIGN : Four_Word;
1295             replaceDst(iter, dst->getType(), alignment);
1296         }
1297 
1298         bool canBeImm = true;
1299         for (int i = 0; i < inst->getNumSrc(); ++i)
1300         {
1301             if (!isGoodAlign1TernarySrc(inst, i, canBeImm))
1302             {
1303                 if (i == 2 && builder.noSrc2Regioning())
1304                 {
1305                     // some additional handling for src2 when src2 regioning is not available
1306                     fixSrc2(iter, bb, false);
1307                 }
1308                 else
1309                 {
1310                     G4_SubReg_Align subalign = (i == 2) ? Four_Word : Any;
1311                     inst->setSrc(insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb, subalign), i);
1312                 }
1313             }
1314             else
1315             {
1316                 if (inst->getSrc(i)->isImm())
1317                 {
1318                     canBeImm = false;
1319                 }
1320             }
1321         }
1322     }
1323 }
1324 
fix3SrcInst(INST_LIST_ITER iter,G4_BB * bb)1325 void HWConformity::fix3SrcInst(INST_LIST_ITER iter, G4_BB* bb)
1326 {
1327     G4_INST* inst = *iter;
1328     if (inst->getNumSrc() != 3 || inst->mayExceedTwoGRF() || inst->opcode() == G4_madm)
1329     {
1330         return;
1331     }
1332 
1333     if (builder.hasAlign1Ternary())
1334     {
1335         fixAlign13SrcInst(iter, bb);
1336         return;
1337     }
1338 
1339     if (inst->opcode() != G4_mad && inst->opcode() != G4_madw)
1340     {
1341         // check that dst and srcs are legal for 3src.  We do not check
1342         // mad and madw since they should already conform by construction
1343         uint8_t execSize = inst->getExecSize();
1344         G4_DstRegRegion* dst = inst->getDst();
1345         if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
1346             !builder.isOpndAligned(dst, (execSize >= 8) ? 32 : execSize * 4))
1347         {
1348             replaceDst(iter, dst->getType());
1349         }
1350         for (int i = 0; i < 3; i++)
1351         {
1352             if (!isGoodAlign16Src(inst, i))
1353             {
1354                 inst->setSrc(
1355                     insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb),
1356                     i);
1357             }
1358         }
1359     }
1360 
1361     //When it is set (Align16), the instruction uses 16-byte-aligned addressing for source and destination operands.
1362     if ((inst->getExecSize() == g4::SIMD1))
1363     {
1364         if (inst->getDst() &&
1365             inst->getDst()->getBase()->isRegVar())
1366         {
1367             if (!builder.isOpndAligned(inst->getDst(), 16))
1368             {
1369                 replaceDst(iter, inst->getDst()->getType(), Eight_Word);
1370             }
1371         }
1372     }
1373 
1374     if (inst->getExecSize() == g4::SIMD16)
1375     {
1376         bool wa3rc = (VISA_WA_CHECK(builder.getPWaTable(), WaDisableSIMD16On3SrcInstr) &&
1377             !(inst->getExecType() == Type_HF &&
1378                 inst->getOperand(Opnd_src1)->isSrcRegRegion() &&
1379                 inst->getOperand(Opnd_src1)->getType() == Type_HF &&
1380                 !inst->getOperand(Opnd_src1)->asSrcRegRegion()->crossGRF()));
1381 
1382         if (wa3rc)
1383         {
1384             evenlySplitInst(iter, bb);
1385         }
1386     }
1387 }
1388 
fixCompareInst(INST_LIST_ITER i,G4_BB * bb,G4_Type exType,int dst_elsize)1389 void HWConformity::fixCompareInst(
1390     INST_LIST_ITER i,
1391     G4_BB* bb,
1392     G4_Type exType,
1393     int dst_elsize)
1394 {
1395     G4_INST* inst = *i;
1396     G4_Operand* dst = inst->getDst();
1397 
1398     if (dst && dst->isNullReg())
1399     {
1400         // change dst hstride if necessary
1401         if (TypeSize(exType) != dst->getTypeSize())
1402         {
1403             // create a new dst with new stride
1404             G4_DstRegRegion* new_null = builder.createNullDst(exType);
1405             inst->setDest(new_null);
1406         }
1407     }
1408 }
1409 
1410 // For integer packing moves, we can replace the src type with the dst type instead of inserting
1411 // a new move to satisfy dst alignment, since integer down conversion is based on truncation
1412 // an inst has to satisfy the following properties:
1413 // -- is a move (duh) and does not have conditional modifiers or saturation
1414 // -- dst must be a direct DstRegRegion that is GRF-aligned
1415 // -- src must be a direct SrcRegRegion with GRF base, no modifiers, and packed/scalar region
1416 // -- both dst and src have integer type, with source stride > dst stride
1417 // returns true if we have successfully down cast the src type
canReplaceMovSrcType(IR_Builder & builder,G4_INST * inst,uint32_t extypesize)1418 static bool canReplaceMovSrcType(IR_Builder& builder, G4_INST* inst, uint32_t extypesize)
1419 {
1420 
1421     if (inst->opcode() != G4_mov || inst->getCondMod() != NULL || inst->getSaturate())
1422     {
1423         return false;
1424     }
1425     if (!inst->getSrc(0)->isSrcRegRegion())
1426     {
1427         return false;
1428     }
1429 
1430     G4_DstRegRegion* dst = inst->getDst();
1431     G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
1432     int dstByteOffset = dst->getByteOffset();
1433     if (dstByteOffset % extypesize != 0 ||
1434         dst->getRegAccess() != Direct)
1435     {
1436         // don't do this if dst is not GRF aligned, since we have to fix it later anyway
1437         return false;
1438     }
1439 
1440     if (src0->getRegAccess() != Direct || src0->getModifier() != Mod_src_undef ||
1441         (src0->getTopDcl() == NULL || src0->getTopDcl()->getRegFile() != G4_GRF))
1442     {
1443         return false;
1444     }
1445 
1446     bool isIntPackingMove = false;
1447     if (IS_TYPE_INT(dst->getType()) && IS_TYPE_INT(src0->getType()))
1448     {
1449         uint32_t dstAlign = dst->getTypeSize() * dst->getHorzStride();
1450         if (dstAlign < src0->getTypeSize())
1451         {
1452             isIntPackingMove = true;
1453         }
1454     }
1455 
1456     if (!isIntPackingMove)
1457     {
1458         return false;
1459     }
1460 
1461     // we only handle direct contiguous and scalar source region for now,
1462     // as VxH and strided regions are a bit harder to update
1463     if (src0->getRegion()->isContiguous(inst->getExecSize()))
1464     {
1465         uint16_t newHS = extypesize / dst->getTypeSize();
1466         if (newHS > 4)
1467         {
1468             // rule out Q -> B moves if Q is not scalar
1469             return false;
1470         }
1471     }
1472     else if (!src0->isScalar())
1473     {
1474         // only handle scalar and contiguous regions for now
1475         return false;
1476     }
1477 
1478     // instead of inserting a move, we change src's type to be same as dst type
1479     // e.g.,
1480     // mov (8) r1.0<1>:b r2.4<8;8,1>:d
1481     // becomes
1482     // mov (8) r1.0<1>:b r2.16<32;8,4>:b
1483     // This is safe since integer down conversion is based on truncation
1484     uint32_t typeSizeRatio = extypesize / dst->getTypeSize();
1485     uint32_t numElt = src0->isScalar() ? 1 : inst->getExecSize() * typeSizeRatio;
1486     G4_Declare* newDcl = builder.createTempVar(numElt, dst->getType(), Any);
1487     newDcl->setAliasDeclare(src0->getBase()->asRegVar()->getDeclare(), 0);
1488     const RegionDesc* region = src0->isScalar() ? builder.getRegionScalar() :
1489         builder.createRegionDesc((uint16_t)inst->getExecSize(), (uint16_t)inst->getExecSize() * typeSizeRatio,
1490             inst->getExecSize(),
1491             (uint16_t)typeSizeRatio);
1492     G4_SrcRegRegion* newSrc = builder.createSrc(
1493         newDcl->getRegVar(),
1494         src0->getRegOff(),
1495         src0->getSubRegOff() * typeSizeRatio,
1496         region,
1497         dst->getType());
1498     inst->setSrc(newSrc, 0);
1499     return true;
1500 }
1501 
1502 // implement HW restrictions on mov
1503 // -- There is no direct conversion from B/UB to DF or DF to B/UB.
1504 //    Use two instructions and a word or DWord intermediate type.
1505 // -- There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1506 //    Use two instructions and a word or DWord intermediate integer type.
1507 // -- There is no direct conversion from HF to DF or DF to HF.
1508 //    Use two instructions and F (Float) as an intermediate type.
1509 // -- There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
1510 //    Use two instructions and F (Float) or a word integer type or a DWord integer type as an intermediate type.
1511 // -- There is no direct scalar conversion from B/UB to HF or F.
1512 //    Use two instructions and a WORD or DWORD intermediate type respectively.
1513 // -- There is no direct conversion from HF to Integer (DWORD or WORD).
1514 //    Use two instructions and F (Float) as an intermediate type.
1515 // returns true if a move is inserted
fixMov(INST_LIST_ITER i,G4_BB * bb)1516 bool HWConformity::fixMov(INST_LIST_ITER i, G4_BB* bb)
1517 {
1518     G4_INST* inst = *i;
1519 
1520     if (inst->opcode() != G4_mov)
1521     {
1522         return false;
1523     }
1524 
1525     G4_Type dstType = inst->getDst()->getType();
1526     G4_Type srcType = inst->getSrc(0)->getType();
1527     auto src = inst->getSrc(0);
1528 
1529     bool scalarByteToFloat = builder.noScalarByteToFloat() &&
1530         IS_BTYPE(srcType) &&
1531         (IS_FTYPE(dstType) || IS_HFTYPE(dstType)) &&
1532         (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar());
1533     bool dstByteSrc64b = IS_BTYPE(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType));
1534 
1535     if (scalarByteToFloat || dstByteSrc64b)
1536     {
1537         replaceDst(i, Type_W);
1538         return true;
1539     }
1540     if (IS_BTYPE(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
1541     {
1542         // mov Q/DF B
1543         replaceDst(i, Type_W);
1544         return true;
1545     }
1546     if (isLowPrecisionFloatTy(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType)))
1547     {
1548         // mov HF Q/DF
1549         replaceDst(i, Type_F);
1550         return true;
1551     }
1552     if (isLowPrecisionFloatTy(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
1553     {
1554         // mov Q/DF HF
1555         replaceDst(i, Type_F);
1556         return true;
1557     }
1558     const bool noHFToInteger = builder.noHFToInteger() &&
1559         IS_HFTYPE(srcType) &&
1560         (dstType == Type_D || dstType == Type_W);
1561     if (noHFToInteger)
1562     {
1563         // mov W/DW HF
1564         replaceDst(i, Type_F);
1565         return true;
1566     }
1567     return false;
1568 }
1569 
fixRotate(INST_LIST_ITER i,G4_BB * bb)1570 bool HWConformity::fixRotate(INST_LIST_ITER i, G4_BB* bb)
1571 {
1572 
1573     // rotate requires src0 and dst to have the same datatype precision
1574     // It also does not support *B/*Q types, but that should be enforced at the vISA level
1575     // returns true if new instruction is inserted
1576     bool changed = false;
1577     G4_INST* inst = *i;
1578     if (inst->opcode() != G4_rol && inst->opcode() != G4_ror)
1579     {
1580         return false;
1581     }
1582     G4_DstRegRegion* dst = inst->getDst();
1583     G4_SrcRegRegion* src = inst->getSrc(0)->asSrcRegRegion();
1584 
1585     MUST_BE_TRUE(IS_WTYPE(dst->getType()) || IS_DTYPE(dst->getType()) || IS_QTYPE(dst->getType()), "dst type must be *W or *D or *Q");
1586     MUST_BE_TRUE(IS_WTYPE(src->getType()) || IS_DTYPE(src->getType()) || IS_QTYPE(src->getType()), "src type must be *W or *D or *Q");
1587 
1588     if (dst->getTypeSize() != src->getTypeSize())
1589     {
1590         // keep exec type same and change dst to be same type as src
1591         replaceDst(i, src->getType());
1592         dst = inst->getDst();
1593         changed = true;
1594     }
1595 
1596     if (dst->getType() == Type_W)
1597     {
1598         dst->setType(Type_UW);
1599     }
1600     else if (dst->getType() == Type_D)
1601     {
1602         dst->setType(Type_UD);
1603     }
1604     else if (builder.getPlatform() >= GENX_PVC && dst->getType() == Type_Q)
1605     {
1606         dst->setType(Type_UQ);
1607     }
1608 
1609     if (src->getType() == Type_W)
1610     {
1611         src->setType(Type_UW);
1612     }
1613     else if (src->getType() == Type_D)
1614     {
1615         src->setType(Type_UD);
1616     }
1617     else if (builder.getPlatform() >= GENX_PVC && src->getType() == Type_Q)
1618     {
1619         src->setType(Type_UQ);
1620     }
1621     return changed;
1622 }
1623 
fixDstAlignment(INST_LIST_ITER i,G4_BB * bb,G4_Type extype,unsigned int dst_elsize)1624 bool HWConformity::fixDstAlignment(INST_LIST_ITER i, G4_BB* bb, G4_Type extype, unsigned int dst_elsize)
1625 {
1626     G4_INST* inst = *i;
1627     bool insertMOV = false;
1628 
1629     unsigned char exec_size = inst->getExecSize();
1630     G4_DstRegRegion* dst = inst->getDst();
1631     G4_Operand* src0 = inst->getSrc(0);
1632     unsigned h_stride = dst->getHorzStride();
1633     unsigned int extypesize = TypeSize(extype);
1634 
1635     if (hasDedicateAlignRegionConformity(i))
1636     {
1637         return insertMOV;
1638     }
1639 
1640     if (inst->hasNULLDst())
1641     {
1642         if (dst_elsize * h_stride < extypesize)
1643         {
1644             uint16_t newHStride = extypesize / dst_elsize;
1645             if (newHStride == 8)
1646             {
1647                 MUST_BE_TRUE(dst_elsize == 1, "expect B/UB dst");
1648                 if (inst->opcode() == G4_mov && exec_size == 1 &&
1649                     src0->isSrcRegRegion() && !src0->asSrcRegRegion()->hasModifier())
1650                 {
1651                     // Just set src to be the same type as dst
1652                     src0->asSrcRegRegion()->setType(dst->getType());
1653                 }
1654                 else
1655                 {
1656                     replaceDst(i, Type_W);
1657                     return true;
1658                 }
1659             }
1660             else
1661             {
1662                 MUST_BE_TRUE(newHStride <= 4, "horizontal stride must be <=4");
1663                 dst->setHorzStride(newHStride);
1664             }
1665         }
1666 
1667         return insertMOV;
1668     }
1669 
1670     // optimize initialization instructions
1671     if (inst->opcode() == G4_mov && src0->isImm() &&
1672         (bb->isAllLaneActive() || inst->isWriteEnableInst()) &&
1673         !inst->getPredicate() &&
1674         dst->getRegAccess() == Direct &&
1675         dst->getHorzStride() == 1 &&
1676         inst->getSaturate() == false &&
1677         IS_BTYPE(dst->getType()) &&
1678         !IS_TYPE_F32_F64(src0->getType()) &&
1679         builder.isOpndAligned(dst, src0->getTypeSize()))
1680     {
1681         // inst is a mov with packed byte dst and int imm source
1682         int64_t value = src0->asImm()->getInt();
1683         uint64_t new_value = (value & 0xFF) | (value << 0x8);
1684         int scale = 2;
1685 
1686         if (IS_DTYPE(src0->getType()))
1687         {
1688             scale = 4;
1689             new_value = (new_value & 0xFFFF) | (new_value << 0x10);
1690         }
1691 
1692         if (exec_size >= scale)
1693         {
1694             G4_Type new_type = (scale == 2) ? Type_UW : Type_UD;
1695             auto newDst = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() / scale, 1, new_type, dst->getAccRegSel());
1696             inst->setDest(newDst);
1697             inst->setSrc(builder.createImm(new_value, new_type), 0);
1698             inst->setExecSize(G4_ExecSize(exec_size / scale));
1699             return insertMOV;
1700         }
1701     }
1702 
1703     bool byteDst = IS_BTYPE(dst->getType());
1704 
1705     // Byte can not be used as dstination of INT*INT
1706     if ((byteDst && inst->opcode() == G4_mul &&
1707         IS_TYPE_INT(inst->getSrc(0)->getType()) && IS_TYPE_INT(inst->getSrc(1)->getType())))
1708     {
1709         // change dst type to W
1710         replaceDst(i, Type_W);
1711         return true;
1712     }
1713 
1714     if (byteDst && extypesize == 8)
1715     {
1716         // Gen doesn't support hstride 8, so we add a W move here
1717         replaceDst(i, Type_W);
1718         return true;
1719     }
1720 
1721     if (builder.hasBFMixMode() && extype == Type_F && inst->getDst()->getType() == Type_BF && !inst->isDpas())
1722     {
1723         // For now, BF mixed mode should not need this check.
1724         // If visa may allow any region as input under bf mixed mode, we need to change this.
1725         return false;
1726     }
1727 
1728     bool dstHFMixModeInst = inst->getDst()->getType() == builder.getMixModeType() && extype == Type_F;
1729     bool dstNotAlignedToExecType = exec_size > 1 && (dst_elsize * h_stride) < extypesize &&
1730         !(builder.hasMixMode() && dstHFMixModeInst);
1731     unsigned short dst_byte_offset;
1732     builder.isOpndAligned(dst, dst_byte_offset, extypesize);
1733     if (!((dst_byte_offset % extypesize == 0) ||
1734         (byteDst &&
1735         (dst_byte_offset % extypesize == 1))
1736        ) ||
1737         /*
1738          * Dynamic offset can be odd for serialized instructions
1739          * or when horizontal offset is dynamic.
1740          * Probably we need the same for any dst with dynamic offsets.
1741          */
1742         (dst_elsize < extypesize &&
1743             dst->getRegAccess() != Direct &&
1744             !(byteDst && extypesize == 2 && exec_size == 1)
1745            ) ||
1746         dstNotAlignedToExecType)
1747     {
1748         /*
1749          * 10.3
1750          * For byte dst type:
1751          * 1. no 1 horstride
1752          * 2. no odd start subreg
1753          * There is only one excpetion - raw mov op
1754          * Raw means src operand has no attribute.
1755          *
1756          * Note: Actually all these cases are now controlled
1757          *       by extypesize value.
1758          */
1759 
1760         if (inst->isRawMov() &&
1761             (dst_byte_offset % extypesize == 0 ||
1762             (byteDst && dst_byte_offset % extypesize == 1)))
1763         {
1764             return insertMOV;
1765         }
1766 
1767         if (canReplaceMovSrcType(builder, inst, extypesize))
1768         {
1769             return false;
1770         }
1771 
1772         if (inst->opcode() == G4_mov)
1773         {
1774             bool intHFConversion = false;
1775             G4_Operand* src0 = inst->getSrc(0);
1776             if (isLowPrecisionFloatTy(dst->getType()) && IS_TYPE_INT(src0->getType()))
1777             {
1778                 intHFConversion = true;
1779             }
1780             else if (isLowPrecisionFloatTy(src0->getType()) && IS_TYPE_INT(dst->getType()))
1781             {
1782                 intHFConversion = true;
1783             }
1784             // F to packed HF operations are handled specially later
1785             bool FtoHFMov = dst->getType() == Type_HF && src0->getType() == Type_F;
1786             if (builder.getPlatform() >= GENX_CHV && !intHFConversion &&
1787                 (inst->isMixedMode() || (builder.hasFtoPackedHFMove() && FtoHFMov && inst->getExecSize() >= builder.getNativeExecSize())))
1788             {
1789                 return insertMOV;
1790             }
1791         }
1792 
1793         if (splitInstListForByteDst(i, bb, (uint16_t)extypesize))
1794         {
1795             return true;
1796         }
1797 
1798         inst->setDest(insertMovAfter(i, dst, dst->getType(), bb));
1799         insertMOV = true;
1800     }
1801 
1802     return insertMOV;
1803 }
1804 
fixPredicateIndirectInst(INST_LIST_ITER it,G4_BB * bb)1805 void HWConformity::fixPredicateIndirectInst(INST_LIST_ITER it, G4_BB* bb)
1806 {
1807     G4_INST* inst = (*it);
1808     if (inst->getPredicate() &&
1809         inst->getDst() &&
1810         !inst->getDst()->isNullReg() &&
1811         inst->getDst()->getRegAccess() == Direct)
1812     {
1813         bool hasIndirectSource = false;
1814         for (int i = 0; i < inst->getNumSrc(); i++)
1815         {
1816             G4_Operand* opnd = inst->getSrc(i);
1817 
1818             if (opnd && opnd->isSrcRegRegion() &&
1819                 opnd->asSrcRegRegion()->getRegAccess() == IndirGRF)
1820             {
1821                 if (inst->opcode() == G4_sel)
1822                 {
1823                     replaceSrc(it, i, opnd->getType(), bb);
1824                 }
1825                 else
1826                 {
1827                     hasIndirectSource = true;
1828                     break;
1829                 }
1830             }
1831         }
1832 
1833         if (hasIndirectSource)
1834         {
1835             replaceDst(it, inst->getDst()->getType());
1836         }
1837     }
1838 }
1839 
1840 /*
1841  * This function checks to see if the instruction's indirect operands
1842  * potentially require totally more than 8 distinct addr reg sub-registers, and
1843  * then determines which of the operands to spill into temporary GRFs so
1844  * as to limit total number of distinct sub-registers used by the instruction
1845  * to 8. This is a requirement imposed by the CM register allocator.
1846  */
1847 
fixIndirectOpnd(INST_LIST_ITER i,G4_BB * bb)1848 bool HWConformity::fixIndirectOpnd(INST_LIST_ITER i, G4_BB* bb)
1849 {
1850     G4_INST* inst = *i;
1851 
1852     G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);
1853     G4_DstRegRegion* dst = inst->getDst();
1854     bool null_dst = (!dst || inst->hasNULLDst());
1855 
1856     bool null_src0 = !src0;
1857     bool null_src1 = !src1 || (inst->isMath() && src1->isNullReg());
1858 
1859     const int addr_reg_max_count = 16;
1860     const int addr_reg_size = TypeSize(Type_UW);
1861     int src_uniq_count = 0;
1862     int src1_count = 0;
1863     int src0_count = 0;
1864     int dst_uniq_count = 0;
1865     int dst_count = 0;
1866     bool nospill_src1 = false;
1867     bool nospill_src0 = false;
1868     bool nospill_dst = false;
1869     bool spill_src1 = false;
1870     bool spill_src0 = false;
1871     bool spill_dst = false;
1872     G4_Declare* addr_dcl0 = NULL, * addr_dcl1 = NULL, * addr_dcl2 = NULL;
1873     if (!null_src0 && src0->isSrcRegRegion() &&
1874         src0->getRegAccess() != Direct && src0->asSrcRegRegion()->getBase()->isRegVar()) {
1875         addr_dcl0 = src0->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1876         // is the following precise?
1877         src0_count = addr_dcl0->getTotalElems();
1878         MUST_BE_TRUE(src0_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1879         src_uniq_count += src0_count;
1880     }
1881 
1882     if (!null_src1 && src1->isSrcRegRegion() &&
1883         src1->getRegAccess() != Direct && src1->asSrcRegRegion()->getBase()->isRegVar()) {
1884         addr_dcl1 = src1->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1885         src1_count = addr_dcl1->getTotalElems();
1886         MUST_BE_TRUE(src1_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1887         if (addr_dcl1 != addr_dcl0) {
1888             // should we use top level dcl here?
1889             src_uniq_count += src1_count;
1890         }
1891         else {
1892             nospill_src1 = true;
1893             nospill_src0 = true;
1894         }
1895     }
1896 
1897     if (!null_dst &&
1898         dst->getRegAccess() != Direct && dst->getBase()->isRegVar())
1899     {
1900         addr_dcl2 = dst->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1901         dst_count = addr_dcl2->getTotalElems();
1902         MUST_BE_TRUE(dst_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1903         if (addr_dcl2 != addr_dcl0 && addr_dcl2 != addr_dcl1) {
1904             dst_uniq_count += dst_count;
1905         }
1906         else if (addr_dcl2 != addr_dcl0) {
1907             nospill_dst = true;
1908             nospill_src0 = true;
1909         }
1910         else {
1911             nospill_dst = true;
1912             nospill_src1 = true;
1913         }
1914     }
1915 
1916     if (src_uniq_count > addr_reg_max_count) {
1917         if (src0_count > src1_count || nospill_src1) {
1918             MUST_BE_TRUE(nospill_src0 == false, "Address of source0 should be spilled.");
1919             spill_src0 = true;
1920             src_uniq_count -= src0_count;
1921         }
1922         else {
1923             MUST_BE_TRUE(nospill_src1 == false, "Address of source1 should be spilled.");
1924             spill_src1 = true;
1925             src_uniq_count -= src1_count;
1926         }
1927     }
1928 
1929     if (src_uniq_count + dst_uniq_count > addr_reg_max_count) {
1930         MUST_BE_TRUE(nospill_dst == false, "Address of dst should be spilled.");
1931 
1932         if (nospill_src1 && nospill_src0) {
1933             spill_dst = true;
1934             dst_uniq_count = 0;
1935         }
1936         else if (dst_uniq_count > src0_count&& dst_uniq_count > src1_count) {
1937             spill_dst = true;
1938             dst_uniq_count = 0;
1939         }
1940         else if (spill_src0) {
1941             spill_src1 = true;
1942             src_uniq_count -= src1_count;
1943         }
1944         else if (spill_src1) {
1945             spill_src0 = true;
1946             src_uniq_count -= src0_count;
1947         }
1948         else if (src0_count > src1_count) {
1949             spill_src0 = true;
1950             src_uniq_count -= src0_count;
1951         }
1952         else {
1953             spill_src1 = true;
1954             src_uniq_count -= src1_count;
1955         }
1956     }
1957 
1958     MUST_BE_TRUE(src_uniq_count + dst_uniq_count <= addr_reg_max_count,
1959         "Remianed number of address registers should be no more than 8 after spill.");
1960 
1961     // Is this only for iselect?
1962     // What if a scalar with indirect addressing is used?
1963     if (spill_src0) {
1964         G4_Operand* new_src0 = insertMovBefore(i, 0, src0->getType(), bb);
1965         inst->setSrc(new_src0, 0);
1966     }
1967 
1968     if (spill_src1 && src1) {
1969         G4_Operand* new_src1 = insertMovBefore(i, 1, src1->getType(), bb);
1970         inst->setSrc(new_src1, 1);
1971     }
1972 
1973     if (spill_dst && dst)
1974     {
1975         G4_DstRegRegion* new_dst = insertMovAfter(i, dst, dst->getType(), bb);
1976         inst->setDest(new_dst);
1977         if (dst != new_dst &&
1978             (IS_FTYPE(dst->getType()) || IS_DFTYPE(dst->getType())))
1979         {
1980             inst->setSaturate(g4::NOSAT);
1981         }
1982     }
1983     return spill_dst;
1984 }
1985 
1986 // If an accumulator is a source operand, its register region must match that of the
1987 // destination register (which means GRF-aligned since we always GRF-align Acc)
1988 // also check for restrictions on explicit acc dst
fixAcc(INST_LIST_ITER iter,G4_BB * bb)1989 bool HWConformity::fixAcc(INST_LIST_ITER iter, G4_BB* bb)
1990 {
1991     G4_INST* inst = *iter;
1992 
1993     bool changed = false;
1994     auto dst = inst->getDst();
1995     if ((dst && dst->isAccReg()) || inst->opcode() == G4_mach)
1996     {
1997         if (!builder.accDstforIndirectSrc())
1998         {
1999             if (inst->getSrc(0)->isSrcRegRegion() && inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == IndirGRF)
2000             {
2001                 inst->setSrc(insertMovBefore(iter, 0, inst->getSrc(0)->getType(), bb), 0);
2002                 changed = true;
2003             }
2004         }
2005     }
2006 
2007     // implicit acc src/dst get its offset from dst
2008     bool useAcc = inst->hasImplicitAccSrc() || inst->hasImplicitAccDst();
2009     if (!useAcc)
2010     {
2011         for (int i = 0; i < inst->getNumSrc(); ++i)
2012         {
2013             G4_Operand* src = inst->getSrc(i);
2014             if (src && src->isAccReg())
2015             {
2016                 useAcc = true;
2017                 break;
2018             }
2019         }
2020     }
2021 
2022     if (useAcc &&
2023         dst &&
2024         dst->getBase() &&
2025         dst->getBase()->isRegVar())
2026     {
2027         if (!builder.isOpndAligned(dst, numEltPerGRF<Type_UB>()))
2028         {
2029             inst->setDest(insertMovAfter(iter, dst, dst->getType(), bb, GRFALIGN));
2030             changed = true;
2031         }
2032     }
2033 
2034     return changed;
2035 }
2036 
2037 /*
2038  * When operation execution size is 1, destination horizontal stride is set
2039  * according to rule 10.2:
2040  *
2041  * 10.1.2. If ExecSize is greater than 1, dst.HorzStride*sizeof(dst.Type) must
2042  *         be equal to or greater than the size of the execution data type.
2043  * 10.2. If ExecSize is 1, dst.HorzStride must not be 0. Note that this is
2044  *       relaxed from rule 10.1.2. Also note that this rule for destination
2045  *       horizontal stride is different from that for source as stated
2046  *       in rule #7.
2047  *
2048  * There are some instructions which work unpredictably if both ExecSize
2049  * and dst.HorzStride are 1. But they work fine if dst.HorzStride is set
2050  * according to rule 10.1.2. So we have to correct all such cases.
2051  *
2052  * This supposed to be the last operation before emitting final assembly code.
2053  */
fixDstHstride(INST_LIST_ITER i,int extypesize)2054 void HWConformity::fixDstHstride(INST_LIST_ITER i, int extypesize)
2055 {
2056     G4_INST* inst = *i;
2057     G4_DstRegRegion* dst = inst->getDst();
2058     int dst_elsize = dst->getTypeSize();
2059 
2060     if (dst)
2061     {
2062         unsigned short hs = dst->getHorzStride();
2063         if (hs * dst_elsize < extypesize)
2064         {
2065             dst->setHorzStride((unsigned short)(extypesize / dst_elsize));
2066         }
2067     }
2068 }
2069 
// Returns true when `region` (a non-null G4_SrcRegRegion or G4_DstRegRegion)
// is based on a register variable that already has a physical register
// assigned with a non-zero physical register offset.
template<class T>
bool isPreAssignedRegOffsetNonZero(T* region)
{
    if (!region->isSrcRegRegion() && !region->isDstRegRegion())
    {
        return false;
    }

    auto base = region->getBase();
    if (!base || !base->isRegVar())
    {
        return false;
    }

    auto regVar = base->asRegVar();
    return regVar->isPhyRegAssigned() && regVar->getPhyRegOff() != 0;
}
2088 
generateMacl(INST_LIST_ITER it,G4_BB * bb)2089 void HWConformity::generateMacl(INST_LIST_ITER it, G4_BB* bb)
2090 {
2091     G4_INST* mulInst = *it;
2092     MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
2093     if (mulInst->getExecSize() > builder.getNativeExecSize())
2094     {
2095         auto startIter = it;
2096         bool isFirstInst = startIter == bb->begin();
2097         if (!isFirstInst)
2098         {
2099             --startIter;
2100         }
2101         evenlySplitInst(it, bb);
2102         if (!isFirstInst)
2103         {
2104             ++startIter;
2105         }
2106         // startIter now points to first mul created by split
2107         auto endIter = it;
2108         ++endIter;
2109         // endIter points to the first inst after the original mul
2110         for (auto iter = startIter; iter != endIter;)
2111         {
2112             auto nextIter = iter;
2113             ++nextIter;
2114             G4_INST* currInst = *iter;
2115             if (currInst->opcode() == G4_mul)
2116             {
2117                 doGenerateMacl(iter, bb);
2118             }
2119             iter = nextIter;
2120         }
2121     }
2122     else
2123     {
2124         doGenerateMacl(it, bb);
2125     }
2126 }
2127 
// convert vISA mul (8) dst src0 src1 into
// mul (8) acc0.0<1>:d src0:d src1:w
// macl (8) dst:d src0:d src1:d
// (the second instruction is created with builder.createMacl)
//
// When noMulOrMadwExpandingBeforeScheduler() and the vISA_expandMulPostSchedule
// option are both set, only the operand fix-ups are done here and the mul is
// tagged with an implicit acc dst; no macl is created in this function.
//
// it - iterator to a mul with dword sources and exec size no larger than the
//      native exec size (both asserted below)
// bb - basic block containing it
void HWConformity::doGenerateMacl(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* mulInst = *it;
    MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
    assert(mulInst->getExecSize() <= builder.getNativeExecSize() && "expect single register inst");

    G4_Operand* src0 = mulInst->getSrc(0);
    G4_Operand* src1 = mulInst->getSrc(1);
    MUST_BE_TRUE(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()), "both sources must have dword type");

    // src1 does not support modifier
    checkSrcMod(it, bb, 1);
    // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
    // mulh dst:d src0:d src1:d
    //  =>
    // mul acc0:d src0:d src1:uw
    // mach dst:d src0:d src1:d
    fixSrc1Region(it, bb);
    // re-read src1: the two fix-ups above may have replaced the operand
    src1 = mulInst->getSrc(1);

    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(it, bb, 0);
        src0 = mulInst->getSrc(0);
    }

    // sat cannot be used at all in the macro sequence
    // this effectively means sat is broken for mul D D D
    mulInst->setSaturate(g4::NOSAT);

    G4_DstRegRegion* origDst = mulInst->getDst();
    // the temporary/acc type is unsigned only when both sources are unsigned
    G4_Type tmpType = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;
    if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMulPostSchedule))
    {
        // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Mul->Mul+Macl expanding will
        // be done in expandMulPostSchedule pass.

        // Need extra mov if dst is acc and src0 is indirect
        if (!builder.accDstforIndirectSrc())
        {
            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                mulInst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
            }
        }

        // need extra move for dst
        if (!IS_DTYPE(origDst->getType()) || origDst->getHorzStride() != 1 ||
            !builder.isOpndAligned(origDst, getGRFSize()))
        {
            // macl dst must be grf-aligned, packed D/UD as it is also used for the implicit acc source's region
            G4_DstRegRegion* tmpDst = insertMovAfter(it, origDst, tmpType, bb, GRFALIGN);
            mulInst->setDest(tmpDst);
        }

        // set implicit acc dst to the mul instruction as acc will be used as dst of the expanded mul after local scheduling.
        // it is a must to fix the WAR/WAW issue of acc in local scheduling.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, mulInst->getDst()->getType());
        mulInst->setImplAccDst(accDstOpnd);
    }
    else
    {
        // the mul now writes acc0; the original dst is handed to the macl below
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
        mulInst->setDest(accDstOpnd);

        // capture the options before WriteEnable is forced on the mul, so the
        // macl is created with the original options
        uint32_t origOptions = mulInst->getOption();
        fixMulSrc1(it, bb);
        mulInst->setOptionOn(InstOpt_WriteEnable);

        G4_Predicate* predicate = mulInst->getPredicate();
        if (predicate != nullptr)
        {
            // move pred to mach
            mulInst->setPredicate(nullptr);
        }
        if (mulInst->getCondMod() != nullptr)
        {
            // conditional modifier cannot be used
            // when the MUL source operand is of dword type.
            MUST_BE_TRUE(false, "Dw multiply does not support conditional modifiers");
            mulInst->setCondMod(nullptr);
        }

        // create a macl inst
        G4_INST* maclInst = builder.createMacl(mulInst->getExecSize(),
            origDst, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, tmpType);
        maclInst->setPredicate(predicate);

        // maintain du chain as fixAccDst uses it later
        mulInst->addDefUse(maclInst, Opnd_implAccSrc);

        // insert the macl immediately after the mul
        INST_LIST_ITER machIter = it;
        machIter = bb->insertBefore(++machIter, maclInst);

        if (!IS_DTYPE(origDst->getType()) || origDst->getHorzStride() != 1 ||
            !builder.isOpndAligned(origDst, getGRFSize()))
        {
            // macl dst must be grf-aligned, packed D/UD as it is also used for the implicit acc source's region
            G4_DstRegRegion* tmpDst = insertMovAfter(machIter, origDst, tmpType, bb, GRFALIGN);
            maclInst->setDest(tmpDst);
        }
    }
}
2235 
2236 // get rid of source modifiers on this inst[srcPos]
checkSrcMod(INST_LIST_ITER it,G4_BB * bb,int srcPos)2237 bool HWConformity::checkSrcMod(INST_LIST_ITER it, G4_BB* bb, int srcPos)
2238 {
2239     bool changed = false;
2240     G4_INST* inst = *it;
2241     assert(srcPos < inst->getNumSrc() && "invalid srcPos");
2242     auto src = inst->getSrc(srcPos);
2243     if (src->isSrcRegRegion())
2244     {
2245         G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
2246         if (srcRegion->getModifier() != Mod_src_undef)
2247         {
2248             G4_Type type = IS_DTYPE(src->getType()) ? src->getType() : Type_D;
2249             src = insertMovBefore(it, srcPos, type, bb);
2250             inst->setSrc(src, srcPos);
2251             changed = true;
2252         }
2253     }
2254     return changed;
2255 }
2256 
2257 // If both source operands of an MUL instruction are of dword integer type,
2258 // only the lower 16 bits of data elements in src0 are used.
2259 // The full precision multiplication results can be only produced together
2260 // with the mach and mov instructions.
2261 
fixMULInst(INST_LIST_ITER & i,G4_BB * bb)2262 bool HWConformity::fixMULInst(INST_LIST_ITER& i, G4_BB* bb)
2263 {
2264     bool insertedInst = false;
2265     G4_INST* inst = *i;
2266     G4_DstRegRegion* dst = inst->getDst();
2267     G4_ExecSize execSize = inst->getExecSize();
2268     bool srcExchanged = false;
2269 
2270     if (dst->isAccReg())
2271     {
2272         return false;
2273     }
2274 
2275     uint32_t inst_opt = inst->getOption();
2276     G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);
2277 
2278     // MUL is commutative and only
2279     // allows src1 to be a constant.
2280     // If src0 is a constant and src1
2281     // is not, they are swapped here.
2282     // If both are constants, they
2283     // will be fixed in checking HW conformity.
2284     // this is fixed in fixOpnd.
2285 
2286     if (src0->isImm() && !src1->isImm())
2287     {
2288         inst->swapSrc(0, 1);
2289         srcExchanged = true;
2290     }
2291 
2292     if (!builder.supportSrcModforMul() &&
2293         (IS_DTYPE(src0->getType()) || IS_DTYPE(src1->getType())) &&
2294         ((src0->getTypeSize()) < 4 || (src1->getTypeSize()) < 4))
2295 
2296     {
2297         checkSrcMod(i, bb, 0);
2298         checkSrcMod(i, bb, 1);
2299     }
2300 
2301     src0 = inst->getSrc(0);
2302     src1 = inst->getSrc(1);
2303     // Q dst needs 64-bit support regardless of src type
2304     bool isDMul = IS_QTYPE(dst->getType()) || (IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()));
2305 
2306     if (!isDMul)
2307     {
2308         return false;
2309     }
2310 
2311     if (builder.hasMacl() && !IS_QTYPE(dst->getType()) &&
2312         (builder.noDwDstForDwordMul() || inst->getExecSize() > g4::SIMD1))
2313     {
2314         // use macl for D = D x D. We use macl when possible
2315         // except on scalar inst on platforms that support native DMul
2316         generateMacl(i, bb);
2317         return true;
2318     }
2319 
2320     bool doNativeMul = false;
2321     if (!builder.no64bitRegioning())
2322     {
2323         // platform natively supports DW-DW multiply, no need to generate mul/mach/mov sequence
2324         doNativeMul = true;
2325     }
2326     else
2327     {
2328         if ((builder.getPlatform() == GENX_CHV || builder.getPlatform() == GENX_BXT))
2329         {
2330             if (inst->getExecSize() == g4::SIMD1)
2331             {
2332                 // scalar insts are a-ok
2333                 return false;
2334             }
2335             // ok if source is scalar or qword-aligned
2336             doNativeMul = dst->getTypeSize() * dst->getHorzStride() == 8;
2337             auto isQWordStride = [inst, this](G4_SrcRegRegion* src)
2338             {
2339                 const RegionDesc* region = src->getRegion();
2340                 if (!region->isScalar())
2341                 {
2342                     uint16_t stride = 0;
2343                     (void)region->isSingleNonUnitStride(inst->getExecSize(), stride);
2344                     if (stride != 2)
2345                     {
2346                         return false;
2347                     }
2348                     // check that source is GRF-aligned to ensure that every element is qword-aligned
2349                     return builder.isOpndAligned(src, 32);
2350                 }
2351                 return true;
2352             };
2353             if (doNativeMul && src0->isSrcRegRegion())
2354             {
2355                 doNativeMul = isQWordStride(src0->asSrcRegRegion());
2356             }
2357             if (doNativeMul && src1->isSrcRegRegion())
2358             {
2359                 doNativeMul = isQWordStride(src1->asSrcRegRegion());
2360             }
2361         }
2362     }
2363 
2364     if (doNativeMul)
2365     {
2366         // promote source to D type if necessary
2367         if (IS_QTYPE(dst->getType()))
2368         {
2369             G4_Type newTy;
2370             G4_Operand* newOpnd;
2371             if (!IS_DTYPE(src0->getType()))
2372             {
2373                 newTy = IS_SIGNED_INT(src0->getType()) ? Type_D : Type_UD;
2374                 newOpnd = insertMovBefore(i, 0, newTy, bb);
2375                 inst->setSrc(newOpnd, 0);
2376                 insertedInst = true;
2377             }
2378 
2379             if (!IS_DTYPE(src1->getType()))
2380             {
2381                 newTy = IS_SIGNED_INT(src1->getType()) ? Type_D : Type_UD;
2382                 if (src1->isImm())
2383                 {
2384                     newOpnd = builder.createImm(src1->asImm()->getImm(), newTy);
2385                 }
2386                 else
2387                 {
2388                     newOpnd = insertMovBefore(i, 1, newTy, bb);
2389                 }
2390                 inst->setSrc(newOpnd, 1);
2391                 insertedInst = true;
2392             }
2393         }
2394         return insertedInst;
2395     }
2396 
2397     // both sources are dword, replace with mul/mach/mov sequence
2398     // At this point, src0 and src1 are both DW, so we simply make
2399     // acc's type (i.e. dst_type) be DW/UD
2400 
2401     G4_CondMod* condmod = builder.duplicateOperand(inst->getCondMod());
2402     G4_Predicate* pred = builder.duplicateOperand(inst->getPredicate());
2403 
2404     G4_Type tmp_type = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;
2405 
2406     // src1 does not support modifier
2407     checkSrcMod(i, bb, 1);
2408     src1 = inst->getSrc(1);
2409 
2410     if (!builder.supportSrcModforMul())
2411     {
2412         checkSrcMod(i, bb, 0);
2413         src0 = inst->getSrc(0);
2414     }
2415 
2416     auto satMod = inst->getSaturate();
2417     inst->setSaturate(g4::NOSAT);
2418 
2419     G4_DstRegRegion* acc_dst_opnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmp_type);
2420     inst->setDest(acc_dst_opnd);
2421     fixMulSrc1(i, bb);
2422 
2423     inst->setNoMask(true);
2424 
2425     if (pred)
2426     {
2427         // conditional modifier cannot be used
2428         // when the MUL source operand is of dword type.
2429         inst->setCondMod(nullptr);
2430     }
2431 
2432     // Dst is either null, or a temp D if the original dst is Q/UQ
2433     G4_DstRegRegion* machDst = NULL;
2434     G4_Declare* high32BitDcl = NULL;
2435     if (IS_QTYPE(dst->getType()))
2436     {
2437         high32BitDcl = builder.createTempVar(execSize, Type_D, Any);
2438         machDst = builder.createDstRegRegion(high32BitDcl, 1);
2439     }
2440     else
2441     {
2442         machDst = builder.createNullDst(Type_D);
2443     }
2444 
2445     // create a mach inst
2446     G4_INST* newInst = builder.createMach(execSize, machDst,
2447         builder.duplicateOperand(src0), builder.duplicateOperand(src1), inst_opt, tmp_type);
2448 
2449     INST_LIST_ITER iter = i;
2450     iter++;
2451     bb->insertBefore(iter, newInst);
2452 
2453     inst->setPredicate(nullptr);
2454 
2455     inst->copyDef(newInst, Opnd_src0, Opnd_src0);
2456     inst->copyDef(newInst, Opnd_src1, Opnd_src1);
2457     inst->transferUse(newInst);
2458     inst->addDefUse(newInst, Opnd_implAccSrc);
2459 
2460     // create an explciit acc source for later use
2461     const RegionDesc* rd = execSize > 1 ? builder.getRegionStride1() : builder.getRegionScalar();
2462     G4_SrcRegRegion* acc_src_opnd = builder.createSrc(
2463         builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);
2464 
2465     insertedInst = true;
2466 
2467     if (IS_QTYPE(dst->getType()))
2468     {
2469         // we have to produce two additional moves to form the Q/UQ:
2470         // mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
2471         // mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d
2472         // mov (8) r6.0<1>:d acc0:d  // Low 32 bits.
2473         // mov (8) dst.0<2>:d r6.0<1>:d
2474         // mov (8) dst.1<2>:d r5.0<1>:d
2475         // Note that we don't try to combine the moves because of the HW restriction that
2476         // "If an accumulator is an explicit source operand, its register region must match that of the destination register"
2477 
2478         G4_Declare* low32BitDcl = builder.createTempVar(execSize, Type_D, Any);
2479         G4_INST* movInst = builder.createMov(execSize,
2480             builder.createDstRegRegion(low32BitDcl, 1),
2481             builder.createSrcRegRegion(*acc_src_opnd), inst_opt, false);
2482         bb->insertBefore(iter, movInst);
2483 
2484         G4_DstRegRegion* origDst = dst;
2485         bool needsExtraMov = origDst->getHorzStride() > 1 || condmod != NULL || satMod;
2486 
2487         G4_Declare* dstAlias = builder.createTempVar(execSize * 2, Type_D, Any);
2488         if (!needsExtraMov)
2489         {
2490             uint32_t aliasOffset = origDst->getRegOff() * numEltPerGRF<Type_UB>() + origDst->getSubRegOff() * 8;
2491             dstAlias->setAliasDeclare(origDst->getBase()->asRegVar()->getDeclare(), aliasOffset);
2492         }
2493         G4_INST* lowMove = builder.createMov(execSize,
2494             builder.createDstRegRegion(dstAlias, 2),
2495             builder.createSrcRegRegion(low32BitDcl, builder.getRegionStride1()),
2496             inst_opt, false);
2497         lowMove->setPredicate(pred);
2498 
2499         bb->insertBefore(iter, lowMove);
2500 
2501         MUST_BE_TRUE(high32BitDcl != NULL, "mach dst must not be null");
2502         G4_INST* highMove = builder.createMov(execSize,
2503             builder.createDst(dstAlias->getRegVar(), 0, 1, 2, dstAlias->getElemType()),
2504             builder.createSrcRegRegion(high32BitDcl, builder.getRegionStride1()),
2505             inst_opt, false);
2506         highMove->setPredicate(pred);
2507         bb->insertBefore(iter, highMove);
2508 
2509         if (needsExtraMov)
2510         {
2511             // this will take care of non-packed dst/cond mod/saturate
2512             G4_Declare* dstAliasAsQ = builder.createTempVar(execSize, Type_Q, Any);
2513             dstAliasAsQ->setAliasDeclare(dstAlias, 0);
2514             G4_INST* moveInst = builder.createMov(execSize, dst, builder.createSrcRegRegion(dstAliasAsQ, builder.getRegionStride1()),
2515                 inst_opt, false);
2516             moveInst->setCondMod(condmod);
2517             moveInst->setSaturate(satMod);
2518             bb->insertBefore(iter, moveInst);
2519         }
2520 
2521         return true;
2522     }
2523 
2524     INST_LIST_ITER last_iter;
2525     // create a mov inst
2526     if (satMod == g4::NOSAT)
2527     {
2528         bool extra_mov = dst && dst->getExecTypeSize() > TypeSize(Type_D);
2529         extra_mov |= isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst);
2530 
2531         G4_INST* movInst = builder.createMov(execSize, dst, builder.createSrcRegRegion(*acc_src_opnd),
2532             inst_opt, false);
2533         movInst->setPredicate(pred);
2534         movInst->setCondMod(condmod);
2535 
2536         newInst->transferUse(movInst);
2537         newInst->addDefUse(movInst, Opnd_src0);
2538 
2539         bb->insertBefore(iter, movInst);
2540         last_iter = iter;
2541         last_iter--;
2542         if (extra_mov)
2543         {
2544             // add a tmp mov
2545             iter--;
2546             G4_DstRegRegion* new_next_dst = insertMovAfter(iter, dst, dst->getType(), bb);
2547             movInst->setDest(new_next_dst);
2548             movInst->setPredicate(NULL);
2549         }
2550     }
2551     else
2552     {
2553         // create an extra mov inst
2554         G4_Declare* dcl = builder.createTempVar(
2555             execSize,
2556             tmp_type,
2557             GRFALIGN);
2558 
2559         G4_DstRegRegion* tmp_dst_opnd = builder.createDst(
2560             dcl->getRegVar(),
2561             0,
2562             0,
2563             1,
2564             tmp_type);
2565         G4_INST* movInst = builder.createMov(execSize, tmp_dst_opnd,
2566             builder.createSrcRegRegion(*acc_src_opnd), InstOpt_NoOpt, false);
2567         movInst->setCondMod(condmod);
2568         bb->insertBefore(iter, movInst);
2569 
2570         last_iter = iter;
2571         last_iter--;
2572 
2573         G4_SrcRegRegion* tmp_src_opnd = builder.createSrc(dcl->getRegVar(), 0, 0, rd, tmp_type);
2574 
2575         G4_INST* newInst2 = builder.createInternalInst(
2576             pred, G4_mov, condmod, satMod, execSize, dst, tmp_src_opnd, NULL, inst_opt);
2577 
2578         newInst->transferUse(newInst2);
2579         newInst->addDefUse(movInst, Opnd_src0);
2580         movInst->addDefUse(newInst2, Opnd_src0);
2581         bb->insertBefore(iter, newInst2);
2582         iter++;
2583     }
2584 
2585     if (execSize > builder.getNativeExecSize())
2586     {
2587         splitDWMULInst(i, last_iter, bb);
2588     }
2589 
2590     return insertedInst;
2591 }
2592 
2593 
// Legalize the MULH instruction at <i> in <bb>. One of three forms is produced:
//  1. "mul Q D D": the mulh is turned into a mul writing a Q/UQ temp, followed by a mov that
//     reads sub-register 1 with stride 2 of that temp (viewed with the original dst type)
//     into the original dst. Taken when execSize <= native exec size, the builder
//     reports 64-bit regioning as usable, and both sources are DW (a W/B immediate src1 is
//     first re-created with src0's type).
//  2. When builder.noMulOrMadwExpandingBeforeScheduler() and vISA_expandMulPostSchedule are
//     both set, the mulh is kept; only its operands/dst are legalized and an implicit acc dst
//     is attached. Per the comments below, the expansion itself is left to the
//     expandMulPostSchedule pass.
//  3. Otherwise the classic expansion:
//       MUL acc src0 src1
//       MACH dst src0 src1
// <i> is taken by reference and is repositioned before returning; see the comments at the
// end of each path for where it is left.
void HWConformity::fixMULHInst(INST_LIST_ITER& i, G4_BB* bb)
{
    G4_INST* inst = *i;
    G4_ExecSize execSize = inst->getExecSize();

    int inst_opt = inst->getOption();

    G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);

    // keep an immediate (if there is exactly one) in src1
    if (src0->isImm() && !src1->isImm())
    {
        inst->swapSrc(0, 1);
        src0 = inst->getSrc(0);
        src1 = inst->getSrc(1);
    }

    // Decide whether the "mul Q D D" form (path 1) can be used.
    bool useMulQDD = false;
    if (execSize <= builder.getNativeExecSize() && !builder.no64bitRegioning() &&
        builder.supportFloatOr64bRegioning())
    {
        useMulQDD = true;
        if (!IS_DTYPE(src0->getType()) || !IS_DTYPE(src1->getType()))
        {
            // The only non-DW combination accepted here is DW src0 with a W/B immediate src1.
            if (src1->isImm() &&
                IS_DTYPE(src0->getType()) &&
                (IS_WTYPE(src1->getType()) || IS_BTYPE(src1->getType())))
            {
                // Ensure src1 has the same type size as src0.
                const G4_Imm* oldImm = src1->asImm();
                G4_Imm* newImm = builder.createImm(oldImm->getInt(), src0->getType());
                inst->setSrc(newImm, 1);
            }
            else
            {
                useMulQDD = false;
            }
        }
    }
    if (useMulQDD)
    {
        // use mul Q D D to get the upper 32-bit
        // note that we don't do this for CHV/BXT due to the 64-bit type restrictions
        inst->setOpcode(G4_mul);
        G4_DstRegRegion* dst = inst->getDst();
        G4_Type dstType = dst->getType();

        // 64-bit temp type: UQ for a UD dst, Q for everything else
        if (dstType == Type_UD)
            dstType = Type_UQ;
        else
            dstType = Type_Q;
        G4_Declare* dstDcl = dst->getBase()->asRegVar()->getDeclare();
        G4_Declare* tmpDcl = builder.createTempVar(
            execSize,
            dstType,
            Any,
            "TV");
        tmpDcl->copyAlign(dstDcl);

        // the mul now writes the 64-bit temp instead of the original dst
        G4_DstRegRegion* tmpDst = builder.createDstRegRegion(tmpDcl, 1);
        inst->setDest(tmpDst);

        //need move to cast back to D/UD type
        // (source starts at sub-register 1 of the temp, typed as the original dst, with
        // stride 2 when execSize > 1 and a scalar region otherwise)
        G4_SrcRegRegion* tmpSrc = builder.createSrc(
            tmpDcl->getRegVar(),
            0,
            1,
            execSize > 1 ? builder.getRegionStride2() : builder.getRegionScalar(),
            dst->getType());

        // the mov reuses the original dst operand and gets a duplicate of the mul's predicate
        G4_INST* tmpMov = builder.createMov(execSize, dst, tmpSrc, inst->getOption(), false);
        tmpMov->setPredicate(builder.duplicateOperand(inst->getPredicate()));

        bb->insertAfter(i, tmpMov);

        // Check the new inserted mov inst
        i++;

        // Need to remove dst from uses list of mulh, and add them to movInst useList
        // add movInst to uselist of mulh.
        // Add mulh to def instruction list of movInst
        inst->transferUse(tmpMov);
        inst->addDefUse(tmpMov, Opnd_src0);
        return;
    }

    // src1 does not support modifier
    checkSrcMod(i, bb, 1);
    // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
    // mulh dst:d src0:d src1:d
    //  =>
    // mul acc0:d src0:d src1:uw
    // mach dst:d src0:d src1:d
    fixSrc1Region(i, bb);
    // re-read src1: the fix-ups above may have replaced the operand
    src1 = inst->getSrc(1);

    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(i, bb, 0);
        src0 = inst->getSrc(0);
    }

    // acc/mach execution type: UD only when both sources are unsigned, D otherwise
    G4_Type tmp_type = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;

    assert(IS_DTYPE(src0->getType()) && "src0 must be DW type");


    if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMulPostSchedule))
    {
        // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Mul->Mul + Macl expanding will
        // be done in expandMulPostSchedule pass.

        if (src1->isImm() && src0->getType() != src1->getType())
        {
            G4_Imm* oldImm = src1->asImm();
            // Ensure src1 has the same type as src0.
            inst->setSrc(builder.createImm(oldImm->getInt(), src0->getType()), 1);
        }
        else if (!IS_DTYPE(src1->getType()))
        {
            // this can happen due to vISA opt, convert them to src0 type which should be D/UD
            // We use D as the tmp type to make sure we can represent all src1 values
            inst->setSrc(insertMovBefore(i, 1, Type_D, bb), 1);
        }

        // Need extra mov if dst is acc and src0 is indirect
        if (!builder.accDstforIndirectSrc())
        {
            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                inst->setSrc(insertMovBefore(i, 0, src0->getType(), bb), 0);
            }
        }

        // end_iter tracks the last instruction of the sequence handed to splitDWMULInst below
        INST_LIST_ITER end_iter = i;
        // this mul will be expanded into mul+macl in expandMulPostSchedule pass. Since expanded macl
        // must be grf-aligned, so need to make mul to be grf-aligned.
        G4_DstRegRegion* dst = inst->getDst();
        if (inst->getSaturate() ||
            dst->getExecTypeSize() > TypeSize(Type_D) ||
            isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst) ||
            !builder.isOpndAligned(dst, getGRFSize()))
        {
            // add a tmp mov
            inst->setDest(insertMovAfter(i, dst, dst->getType(), bb, GRFALIGN));
            end_iter++;
        }

        // sat cannot be used at all in the macro sequence
        // this effectively means sat is broken for mul D D D
        inst->setSaturate(g4::NOSAT);

        // set implicit acc dst to the mulh instruction as acc will be used as dst of the expanded mul after local scheduling.
        // it is a must to fix the WAR/WAW issue of acc in local scheduling.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, inst->getDst()->getType());
        inst->setImplAccDst(accDstOpnd);

        if (execSize > builder.getNativeExecSize())
        {
            auto start_iter = i;
            splitDWMULInst(start_iter, end_iter, bb);
            // start_iter points to the first half of mulh. Need double check this new inserted mulh to see if need split again
            i = start_iter;
        }
        else
        {
            // no split: advance <i> past the mulh
            i++;
        }
    }
    else
    {
        // Path 3: emit "mul acc0 src0 src1" in front of the mulh, then replace the mulh
        // itself with a mach.
        G4_DstRegRegion* acc_dst_opnd = builder.createDst(
            builder.phyregpool.getAcc0Reg(),
            0,
            0,
            1,
            tmp_type);

        G4_INST* newMul = builder.createBinOp(G4_mul, execSize,
            acc_dst_opnd, builder.duplicateOperand(src0), builder.duplicateOperand(src1), inst_opt, false);

        bb->insertBefore(i, newMul);
        inst->copyDefsTo(newMul, false);

        // std::prev(i) is the mul that was just inserted
        fixMulSrc1(std::prev(i), bb);
        newMul->setNoMask(true);

        // src1 for the mach: the mulh's src1 unless it has to be retyped to DW
        auto machSrc1 = inst->getSrc(1);
        if (src1->isImm() && src0->getType() != src1->getType())
        {
            G4_Imm* oldImm = src1->asImm();
            // Ensure src1 has the same type as src0.
            machSrc1 = builder.createImm(oldImm->getInt(), src0->getType());
        }
        else if (!IS_DTYPE(src1->getType()))
        {
            // this can happen due to vISA opt, convert them to src0 type which should be D/UD
            // We use D as the tmp type to make sure we can represent all src1 values
            machSrc1 = insertMovBefore(i, 1, Type_D, bb);
        }

        // We don't duplicate the operands here as original inst is unlinked
        // ToDo: this invalidate du-chain, do we still need to maintain it?
        auto machInst = builder.createMach(inst->getExecSize(), inst->getDst(), inst->getSrc(0), machSrc1, inst_opt, tmp_type);
        machInst->setPredicate(inst->getPredicate());
        machInst->setCondMod(inst->getCondMod());
        // overwrite the list slot: the mach takes the mulh's place in the instruction list
        *i = machInst;
        inst->transferUse(machInst);
        inst->removeAllDefs();
        newMul->addDefUse(machInst, Opnd_implAccSrc);

        // end_iter tracks the last instruction of the sequence handed to splitDWMULInst below
        INST_LIST_ITER end_iter = i;
        // check if the ACC source is aligned to mach dst
        // ToDo: this should be checked by fixAcc?
        G4_DstRegRegion* dst = inst->getDst();
        if (inst->getSaturate() ||
            dst->getExecTypeSize() > TypeSize(Type_D) ||
            isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst))
        {
            // add a tmp mov
            machInst->setDest(insertMovAfter(i, dst, dst->getType(), bb));
            end_iter++;
        }

        if (execSize > builder.getNativeExecSize())
        {
            auto start_iter = std::prev(i);
            splitDWMULInst(start_iter, end_iter, bb);
            // start_iter points to the first half of mul. Need to check the new inserted mul/mach instructions
            i = start_iter;
        }
        else
        {
            // i points to mach, and need to check the new inserted mul before mach
            i = std::prev(i);
        }
    }
    return;
}
2835 
2836 //
2837 // insert move instructions to copy numDwords dwords from src to dst at the specified location
2838 // a NoMask UD move is used.
2839 // dst and src must be dword-aligned.
2840 // srcOffset and dstOffset are in bytes
2841 // numDwords must be one of {1,2,4,8,16}
2842 // ToDo: may want to generalize this into a copyBytes function that selects the appropriate move type
2843 // based on dst and src type
2844 //
copyDwords(G4_Declare * dst,int dstOffset,G4_Declare * src,int srcOffset,int numDwords,G4_BB * bb,INST_LIST_ITER iter)2845 void HWConformity::copyDwords(G4_Declare* dst,
2846     int dstOffset,
2847     G4_Declare* src,
2848     int srcOffset,
2849     int numDwords,
2850     G4_BB* bb,
2851     INST_LIST_ITER iter)
2852 {
2853 
2854     MUST_BE_TRUE(numDwords == 1 || numDwords == 2 || numDwords == 4 ||
2855         numDwords == 8 || numDwords == 16 || numDwords == 32, "invalid number of dwords to copy");
2856 
2857     G4_Declare* newDst = dst;
2858 
2859     if (dst->getElemType() != Type_UD)
2860     {
2861         // create an alias with type UD
2862         newDst = builder.createTempVar(numDwords, Type_UD, Any);
2863         newDst->setAliasDeclare(dst, 0);
2864     }
2865 
2866     G4_Declare* newSrc = src;
2867     if (src->getElemType() != Type_UD)
2868     {
2869         // create an alias with type UD
2870         newSrc = builder.createTempVar(numDwords, Type_UD, Any);
2871         newSrc->setAliasDeclare(src, 0);
2872     }
2873 
2874     G4_SrcRegRegion* srcOpnd = builder.createSrc(
2875         newSrc->getRegVar(), srcOffset / numEltPerGRF<Type_UB>(),
2876         (srcOffset % numEltPerGRF<Type_UB>()) / TypeSize(Type_UD),
2877         builder.getRegionStride1(), Type_UD);
2878     G4_DstRegRegion* dstOpnd = builder.createDst(newDst->getRegVar(),
2879         dstOffset / numEltPerGRF<Type_UB>(),
2880         (dstOffset % numEltPerGRF<Type_UB>()) / TypeSize(Type_UD), 1, Type_UD);
2881 
2882     G4_INST* movInst = builder.createMov(G4_ExecSize(numDwords), dstOpnd, srcOpnd, InstOpt_WriteEnable, false);
2883 
2884     INST_LIST_ITER movPos = bb->insertBefore(iter, movInst);
2885 
2886     if (numDwords == numEltPerGRF<Type_UD>() * 2 &&
2887         ((dstOffset % numEltPerGRF<Type_UB>()) != 0 || (srcOffset % numEltPerGRF<Type_UB>()) != 0))
2888     {
2889         // move crosses 2 GRF boundary, needs splitting
2890         evenlySplitInst(movPos, bb);
2891     }
2892 }
2893 
2894 // like the above, but source is an indirect 64-bit source and dst offset is always 0
2895 // If source is Indirect 1x1, we generate
2896 //  mov (esize*2) tmp<1>:ud r[A0]<1;1,0>:ud
2897 //  ...     tmpSrc<region>:q
2898 // If source is VxH indirect, we have to generate instead
2899 //  mov (esize*2) tmp<1>:ud r[A0]<2,1>:ud
2900 //  ...     tmpSrc<1;1,0>:q
2901 // as we can't have the indirect region on the 64-bit type operand
2902 // A0 is not changed otherwise
copyDwordsIndirect(G4_Declare * dst,G4_SrcRegRegion * src,int numDwords,G4_BB * bb,INST_LIST_ITER iter)2903 void HWConformity::copyDwordsIndirect(G4_Declare* dst,
2904     G4_SrcRegRegion* src,
2905     int numDwords,
2906     G4_BB* bb,
2907     INST_LIST_ITER iter)
2908 {
2909     MUST_BE_TRUE(
2910         TypeSize(dst->getElemType()) >= 4 && src->getTypeSize() >= 4,
2911         "dst and src must have dword or qword type");
2912 
2913     MUST_BE_TRUE(src->getRegAccess() == IndirGRF, "source must be indirect GRF");
2914 
2915     G4_Declare* newDst = dst;
2916 
2917     if (dst->getElemType() != Type_UD)
2918     {
2919         // create an alias with type UD
2920         newDst = builder.createTempVar(numDwords, Type_UD, Any);
2921         newDst->setAliasDeclare(dst, 0);
2922     }
2923 
2924     G4_SrcRegRegion* newSrc = builder.duplicateOperand(src);
2925     MUST_BE_TRUE(newSrc->getTypeSize() == 8, "only support 64-bit type source so far");
2926     newSrc->setType(Type_UD);
2927     newSrc->setModifier(Mod_src_undef);
2928     if (newSrc->getRegion()->isRegionWH())
2929     {
2930         MUST_BE_TRUE(newSrc->getRegion()->width == 1, "only handle <1,0> region for now");
2931         newSrc->setRegion(builder.createRegionDesc(UNDEFINED_SHORT, 2, 1));
2932     }
2933     else
2934     {
2935         newSrc->setRegion(builder.getRegionStride1());
2936     }
2937 
2938     G4_DstRegRegion* dstOpnd = builder.createDst(newDst->getRegVar(), 0, 0, 1, Type_UD);
2939 
2940     G4_INST* movInst = builder.createMov(G4_ExecSize(numDwords), dstOpnd, newSrc, InstOpt_WriteEnable, false);
2941 
2942     bb->insertBefore(iter, movInst);
2943 }
2944 
2945 // copy numRegs GRFs from src[srcOffset] to dst[dstOffset]
2946 // dst[dstOffset] and src[srcOffset] are both GRF-aligned
copyRegs(G4_Declare * dst,int dstOffset,G4_Declare * src,int srcOffset,int numRegs,G4_BB * bb,INST_LIST_ITER iter)2947 void HWConformity::copyRegs(G4_Declare* dst,
2948     int dstOffset,
2949     G4_Declare* src,
2950     int srcOffset,
2951     int numRegs,
2952     G4_BB* bb,
2953     INST_LIST_ITER iter)
2954 {
2955     int numByteCopied = 0;
2956     for (; numRegs >= 2; numRegs -= 2, numByteCopied += numEltPerGRF<Type_UB>() * 2)
2957     {
2958         copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, numEltPerGRF<Type_UD>() * 2, bb, iter);
2959     }
2960     if (numRegs != 0)
2961     {
2962         copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, numEltPerGRF<Type_UD>(), bb, iter);
2963     }
2964 }
2965 
2966 //
2967 // Note that this function may invalidate <iter>
2968 //
emulate64bMov(INST_LIST_ITER iter,G4_BB * bb)2969 bool HWConformity::emulate64bMov(INST_LIST_ITER iter, G4_BB* bb)
2970 {
2971     auto inst = (*iter);
2972     auto origIter = iter;
2973     auto dst = inst->getDst();
2974     auto src0 = inst->getSrc(0);
2975 
2976     MUST_BE_TRUE(!inst->getCondMod(), "cant handle cond mod");
2977     auto dstHS = dst->getHorzStride();
2978 
2979     auto incrementVar = [&](G4_Operand* var, unsigned int width, unsigned int regOff, unsigned int sregOff, G4_INST* inst, short increment)
2980     {
2981         auto addrDst = builder.createDst(var->getBase(), regOff, sregOff, 1, Type_UW);
2982         auto addrSrc = builder.createSrc(var->getBase(), regOff, sregOff,
2983             builder.getRegionStride1(), Type_UW);
2984         auto incrementImm = builder.createImm(increment, Type_W);
2985         auto addrAddInst = builder.createInternalInst(
2986             nullptr, G4_add, nullptr, g4::NOSAT,
2987             G4_ExecSize(inst->getExecSize() / width),
2988             addrDst, addrSrc, incrementImm, InstOpt_WriteEnable);
2989         return addrAddInst;
2990     };
2991 
2992     if (src0->isSrcRegRegion())
2993     {
2994         auto src0RR = src0->asSrcRegRegion();
2995         MUST_BE_TRUE(IS_INT(src0RR->getType()) && IS_INT(dst->getType()), "expecting int types on src, dst");
2996         MUST_BE_TRUE(src0RR->getModifier() == Mod_src_undef, "cannot handle saturation");
2997 
2998         const RegionDesc* rgnToUse = nullptr;
2999 
3000         if (src0RR->getRegion()->isScalar())
3001             rgnToUse = builder.getRegionScalar();
3002         else if (!src0RR->isIndirect())
3003         {
3004             uint16_t stride = 0;
3005             bool legal = src0RR->getRegion()->isSingleStride(inst->getExecSize(), stride);
3006             MUST_BE_TRUE(legal, "unsupported region");
3007             if (stride == 1)
3008                 rgnToUse = builder.getRegionStride2();
3009             else if (stride == 2)
3010                 rgnToUse = builder.getRegionStride4();
3011             else
3012                 MUST_BE_TRUE(false, "unsupported stride");
3013         }
3014         else
3015         {
3016             if (src0RR->getTypeSize() < 8)
3017                 rgnToUse = src0RR->getRegion();
3018             else
3019             {
3020                 // this will be broken up in to 2 instructions
3021                 auto factor = src0RR->getTypeSize() / dst->getTypeSize();
3022                 auto vs = src0RR->getRegion()->vertStride * factor;
3023                 auto w = src0RR->getRegion()->width;
3024                 auto hs = src0RR->getRegion()->horzStride * factor;
3025                 rgnToUse = builder.createRegionDesc(vs, w, hs);
3026             }
3027         }
3028 
3029         if (dst->getTypeSize() == 8)
3030         {
3031             if (src0->getTypeSize() == 8)
3032             {
3033                 // may be q->uq or uq->q or raw mov
3034                 // safe to do raw copy for all 3 cases
3035 
3036                 bool isNoMaskInst = !inst->getPredicate() && (inst->isWriteEnableInst() || bb->isAllLaneActive());
3037                 if (isNoMaskInst && inst->getExecSize() == g4::SIMD1 && src0->asSrcRegRegion()->isScalar())
3038                 {
3039                     // For SIMD1 case that is not under divergent CF, we can change to UD type directly:
3040                     // mov (1) r10.1<1>:uq   r20.0<0;1,0>:uq
3041                     // =>
3042                     // mov (2) r10.2<1>:ud   r20.0<1;1,0>:ud
3043                     G4_DstRegRegion* newDst = nullptr;
3044                     if (dst->isIndirect())
3045                     {
3046                         newDst = builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), dst->getHorzStride(), Type_UD, dst->getAddrImm());
3047                     }
3048                     else
3049                     {
3050                         newDst = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, dst->getHorzStride(), Type_UD, dst->getAccRegSel());
3051                     }
3052 
3053                     G4_SrcRegRegion* newSrc = nullptr;
3054                     if (src0->getRegAccess() == Direct)
3055                     {
3056                         newSrc = builder.createSrcRegRegion(src0RR->getModifier(), Direct, src0RR->getBase(),
3057                             src0RR->getRegOff(), src0RR->getSubRegOff() * 2, builder.getRegionStride1(), Type_UD);
3058                     }
3059                     else
3060                     {
3061                         newSrc = builder.createIndirectSrc(src0RR->getModifier(), src0RR->getBase(), src0RR->getRegOff(),
3062                             src0RR->getSubRegOff(), builder.getRegionStride1(), Type_UD, src0RR->getAddrImm());
3063                     }
3064 
3065                     inst->setSrc(newSrc, 0);
3066                     inst->setDest(newDst);
3067                     inst->setExecSize(G4_ExecSize(inst->getExecSize() * 2u));
3068                     inst->setOptionOn(InstOpt_WriteEnable);
3069                     inst->setMaskOption(InstOpt_M0);
3070 
3071                     return true;
3072                 }
3073                 else
3074                 {
3075                     // mov (8) r10.0<1>:uq   r20.0<1;1,0>:uq
3076                     // =>
3077                     // mov (8) r10.0<2>:ud   r20.0<2;1,0>:ud
3078                     // mov (8) r10.1<2>:ud   r20.1<2;1,0>:ud
3079 
3080                     // 1st half
3081                     auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, dst->getAddrImm())) :
3082                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_UD));
3083                     auto newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
3084                         src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * 2), rgnToUse, Type_UD);
3085                     newSrc->setImmAddrOff(src0RR->getAddrImm());
3086                     auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
3087                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3088                     iter = bb->insertBefore(origIter, newInst);
3089 
3090                     // second half
3091                     bool dstAddrIncremented = false, src0AddrIncremented = false;
3092                     unsigned int immAddrOff = 4;
3093                     if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
3094                     {
3095                         // increment dst address register by 4, later decrement it
3096                         dstAddrIncremented = true;
3097                         immAddrOff = 0;
3098                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
3099                     }
3100                     newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, immAddrOff + dst->getAddrImm())) :
3101                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_UD));
3102                     newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
3103                         src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * 2 + 1), rgnToUse, Type_UD);
3104                     if (newSrc->isIndirect())
3105                     {
3106                         // upper 4 bytes
3107                         if ((4 + src0RR->getAddrImm()) > 512)
3108                         {
3109                             src0AddrIncremented = true;
3110                             iter = bb->insertBefore(origIter, incrementVar(src0RR, src0RR->getRegion()->width, src0RR->getRegOff(), src0RR->getSubRegOff(), inst, 4));
3111                             newSrc->setImmAddrOff(src0RR->getAddrImm());
3112                         }
3113                         else
3114                             newSrc->setImmAddrOff(4 + src0RR->getAddrImm());
3115                     }
3116                     newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
3117                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3118                     iter = bb->insertBefore(origIter, newInst);
3119 
3120                     if (dstAddrIncremented)
3121                     {
3122                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
3123                     }
3124 
3125                     if (src0AddrIncremented)
3126                     {
3127                         iter = bb->insertBefore(origIter, incrementVar(src0RR, src0RR->getRegion()->width, src0RR->getRegOff(), src0RR->getSubRegOff(), inst, -4));
3128                     }
3129 
3130                     bb->erase(origIter);
3131 
3132                     return true;
3133                 }
3134             }
3135             else if (dst->getTypeSize() == 8 && src0->getTypeSize() < 8)
3136             {
3137                 // d/ud/w/uw/b/ub -> q/uq
3138                 if (IS_SIGNED_INT(src0->getType()))
3139                 {
3140                     // when src is signed, sign extend
3141                     // b/w/d -> q/uq
3142                     //
3143                     // dst<2>.0:d = src:[d|w|b]
3144                     // dst<2>.1:d = asr dst<2>.0:d 31
3145                     auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, dst->getAddrImm())) :
3146                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_D));
3147                     auto newSrc = builder.createSrcRegRegion(*src0RR);
3148                     auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
3149                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3150                     iter = bb->insertBefore(origIter, newInst);
3151 
3152                     bool dstAddrIncremented = false;
3153                     unsigned int immAddrOff = 4;
3154                     if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
3155                     {
3156                         // increment dst address register by 4, later decrement it
3157                         dstAddrIncremented = true;
3158                         immAddrOff = 0;
3159                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
3160                     }
3161 
3162                     newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, immAddrOff + dst->getAddrImm())) :
3163                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_D));
3164                     if (dst->isIndirect())
3165                     {
3166                         newSrc = builder.createSrcRegRegion(Mod_src_undef, IndirGRF, dst->getBase(), dst->getRegOff(), dst->getSubRegOff(),
3167                             rgnToUse, Type_D);
3168                         newSrc->setImmAddrOff(newDst->getAddrImm());
3169                     }
3170                     else
3171                         newSrc = builder.createSrc(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2,
3172                             builder.getRegionStride2(), Type_D);
3173                     auto imm31 = builder.createImm(31, Type_W);
3174                     newInst = builder.createBinOp(G4_asr, inst->getExecSize(), newDst, newSrc, imm31, inst->getOption(), false);
3175                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3176                     iter = bb->insertBefore(origIter, newInst);
3177 
3178                     if (dstAddrIncremented)
3179                     {
3180                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
3181                     }
3182 
3183                     bb->erase(origIter);
3184 
3185                     return true;
3186                 }
3187                 else
3188                 {
3189                     // when src is unsigned, zero extend
3190                     // ub/uw/ud -> q/uq
3191                     //
3192                     // dst<2>.0:ud = src:[ud|uw|ub]
3193                     // dst<2>.1:ud = 0
3194 
3195                     auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, dst->getAddrImm())) :
3196                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_UD));
3197                     auto newSrc = builder.createSrcRegRegion(*src0RR);
3198                     auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
3199                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3200                     iter = bb->insertBefore(origIter, newInst);
3201 
3202                     bool dstAddrIncremented = false;
3203                     unsigned int immAddrOff = 4;
3204                     if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
3205                     {
3206                         // increment dst address register by 4, later decrement it
3207                         dstAddrIncremented = true;
3208                         immAddrOff = 0;
3209                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
3210                     }
3211                     newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, immAddrOff + dst->getAddrImm())) :
3212                         (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_UD));
3213                     auto imm0 = builder.createImm(0);
3214                     newInst = builder.createMov(inst->getExecSize(), newDst, imm0, inst->getOption(), false);
3215                     newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3216                     iter = bb->insertBefore(origIter, newInst);
3217 
3218                     if (dstAddrIncremented)
3219                     {
3220                         iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
3221                     }
3222 
3223                     bb->erase(origIter);
3224 
3225                     return true;
3226                 }
3227             }
3228         }
3229         else if (dst->getTypeSize() < 8 && src0->getTypeSize() == 8)
3230         {
3231             // truncate
3232             // q/uq -> d/ud/w/uw/b/ub
3233             // 1. mov(8) r10.0<1>:d   r20.0<1;1,0>:uq
3234             // =>
3235             // mov(8) r10.0<1>:d   r20.0<2;1,0>:d
3236             //
3237             // 2. mov(8) r10.0<1>:d   r20.1<2;1,0>:uq
3238             // =>
3239             // mov(8) r10.0<1>:d   r20.2<4;1,0>:d
3240 
3241             unsigned int factor = src0->getTypeSize() / dst->getTypeSize();
3242             auto newDst = builder.createDstRegRegion(*dst);
3243             auto newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
3244                 src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * factor), rgnToUse, dst->getType());
3245             newSrc->setImmAddrOff(src0RR->getAddrImm());
3246             auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
3247             newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3248             iter = bb->insertBefore(origIter, newInst);
3249 
3250             bb->erase(origIter);
3251 
3252             return true;
3253         }
3254     }
3255     else if (src0->isImm())
3256     {
3257         auto imm = src0->asImm()->getInt();
3258         int low = imm & 0xffffffff;
3259         int high = (imm >> 32) & 0xffffffff;
3260 
3261         // low
3262         auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, dst->getAddrImm())) :
3263             (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_D));
3264         auto immLowSrc = builder.createImm(low, Type_D);
3265         auto newInst = builder.createMov(inst->getExecSize(), newDst, immLowSrc, inst->getOption(), false);
3266         newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3267         iter = bb->insertBefore(origIter, newInst);
3268 
3269         // high
3270         bool dstAddrIncremented = false;
3271         unsigned int immAddrOff = 4;
3272         if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
3273         {
3274             // increment dst address register by 4, later decrement it
3275             dstAddrIncremented = true;
3276             immAddrOff = 0;
3277             iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
3278         }
3279         newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, immAddrOff + dst->getAddrImm())) :
3280             (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_D));
3281         auto immHigh = builder.createImm(high, Type_D);
3282         newInst = builder.createMov(inst->getExecSize(), newDst, immHigh, inst->getOption(), false);
3283         newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
3284         iter = bb->insertBefore(origIter, newInst);
3285 
3286         if (dstAddrIncremented)
3287         {
3288             iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
3289         }
3290 
3291         bb->erase(origIter);
3292 
3293         return true;
3294     }
3295 
3296     return false;
3297 }
3298 
// Legalizes an instruction that uses 64-bit data on platforms for which
// builder.no64bitRegioning() is true.
//
// The function is a no-op (returns false) when builder.no64bitRegioning() is
// false or when inst->mayExceedTwoGRF() is true.
//
// An instruction is treated as "64-bit" when its dst or any of its sources has
// an 8-byte type, or when it is a mul whose src0 and src1 both have a DWord
// type. For such an instruction the following steps are applied in order:
//   1. If builder.noInt64(): an integer-to-integer mov is handed to
//      emulate64bMov(); if that succeeds the function returns true immediately.
//   2. Every indirect (IndirGRF) source is copied into a GRF-aligned temporary.
//   3. Direct sources with a too-small stride, or (when hasSameSubregOffset(inst)
//      is false) any non-scalar source, are copied into a GRF-aligned temporary.
//   4. Source regions are rewritten so that VS == W * HS where that is possible;
//      the remaining case asserts.
//   5. A dst that is indirect, an ARF, or (for execSize > 1) not offset-matched
//      is redirected through a GRF-aligned temporary that is copied back after
//      the instruction.
//
// iter: position of the instruction to fix inside bb. It is taken by value, so
//       the caller's iterator is never advanced by this function.
// bb:   basic block that owns the instruction; new instructions are inserted
//       into it.
//
// Returns true only through the emulate64bMov() path; every other path returns
// false, including paths that inserted instructions around iter.
// NOTE(review): emulate64bMov() appears to erase the original instruction when
// it returns true, so callers presumably must not dereference iter afterwards --
// confirm against the callers.
bool HWConformity::fix64bInst(INST_LIST_ITER iter, G4_BB* bb)
{

    // HW restrictions:
    // [DevCHV, DevBXT]: When source or destination datatype is 64b, indirect addressing must not be used.
    // the region rules are:
    // Source and Destination horizontal stride must be aligned to the execution datatype.
    // Example:
    // mov (4) r10.0:df r11.0<16;8,2>:f // Source stride must be 2 since datatype is smaller
    // mov (4) r10.0<2>:f r11.0<4;4,1>:df // Destination stride must be 2 since datatype is smaller.
    // as this would require splitting in some cases
    // Regioning must ensure Src.Vstride = Src.Width * Src.Hstride
    // Source and Destination offset must be the same, except the case of scalar source
    // [DevCHV, DevBXT]: ARF registers must never be used with 64b datatype.

    // These restrictions only apply when the builder reports no 64-bit regioning support.
    if (!builder.no64bitRegioning())
    {
        return false;
    }

    G4_INST* inst = *iter;
    bool uses64BitType = false;
    bool isDWMultiply = false;
    uint8_t execSize = inst->getExecSize();

    // Instructions that may exceed two GRFs are not handled here.
    if (inst->mayExceedTwoGRF())
    {
        return false;
    }
    // Detect a 64-bit operand: first the dst, then the sources (the loop stops
    // at the first 8-byte source).
    if (inst->getDst() && inst->getDst()->getTypeSize() == 8)
    {
        uses64BitType = true;
    }
    for (int i = 0, size = inst->getNumSrc(); !uses64BitType && i < size; i++)
    {
        G4_Operand* src = inst->getSrc(i);

        if (src && src->getTypeSize() == 8)
        {
            uses64BitType = true;
        }
    }
    if (inst->opcode() == G4_mul && IS_DTYPE(inst->getSrc(0)->getType()) &&
        IS_DTYPE(inst->getSrc(1)->getType()))
    {
        //WA: dw*dw multiply is considered to use 64bit data type since the result is 64-bit
        uses64BitType = true;
        isDWMultiply = true;
    }

    if (uses64BitType)
    {
        if (builder.noInt64())
        {
            // handle i64 mov/add/cmp/sel
            // ToDo: move it to its own pass
            // Only an integer-to-integer mov is attempted here; if the emulation
            // does not apply we fall through to the generic fix-ups below.
            if (inst->opcode() == G4_mov && IS_INT(inst->getDst()->getType()) && IS_INT(inst->getSrc(0)->getType()))
            {
                if (emulate64bMov(iter, bb))
                    return true;
            }
        }

        int numSrc = inst->getNumSrc();

        // handle indirect sources first
        for (int i = 0; i < numSrc; ++i)
        {
            G4_Operand* src = inst->getSrc(i);
            if (src != nullptr && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                int byteSize = srcAsRegion->getTypeSize();
                if (byteSize == 8)
                {
                    // right bound is not available for indirect operands
                    // FIXME: this code should be moved to getRightBound()
                    // rightBound is computed here as the byte footprint of the region.
                    int rightBound = 0;
                    // we must change move type to UD

                    if (region->isScalar())
                    {
                        rightBound = byteSize;
                    }
                    else if (region->isRegionWH())
                    {
                        rightBound = inst->getExecSize() * byteSize;
                    }
                    else
                    {
                        // byte offset of the last element of the last row, plus one element
                        int num_rows = inst->getExecSize() / region->width;
                        rightBound = (num_rows - 1) * region->vertStride * byteSize +
                            region->horzStride * (region->width - 1) * byteSize +
                            byteSize;
                    }

                    // The temp is declared with the source's 64-bit type (hence
                    // numDwords / 2 elements) and filled dword by dword.
                    int numDwords = rightBound / TypeSize(Type_UD);
                    numDwords = Round_Up_Pow2(numDwords);
                    G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), GRFALIGN);
                    // new source's region varies depending on whether it's VxH or 1x1
                    const RegionDesc* newRegion = region->isRegionWH() ? builder.getRegionStride1() : region;
                    copyDwordsIndirect(tmpSrc, srcAsRegion, numDwords, bb, iter);
                    // The replacement operand is direct, starts at offset 0 of the
                    // temp and keeps the original source modifier.
                    G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
                        Direct, tmpSrc->getRegVar(), 0, 0, newRegion, tmpSrc->getElemType());
                    inst->setSrc(tmpSrcOpnd, i);
                }
                else
                {
                    // use the good ol' insertMovBefore
                    // and force the resulting temp to be GRF-aligned
                    G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
                    G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
                    tmpSrcDcl->setSubRegAlign(GRFALIGN);
                    inst->setSrc(tmpSrc, i);
                }
            }
        }

        // now handle direct sources with bad region/alignment
        // hasSameOffset is sampled here, i.e. after the indirect sources above
        // have already been replaced; it is reused for the dst check further down.
        bool hasSameOffset = hasSameSubregOffset(inst);
        for (int i = 0; i < numSrc; i++)
        {
            G4_Operand* src = inst->getSrc(i);
            if (src != NULL && src->isSrcRegRegion())
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                int byteSize = srcAsRegion->getTypeSize();

                // Case 1: non-scalar, non-64-bit source whose element pitch
                // (type size * horizontal stride) is below 8 bytes. Skipped for
                // the dw*dw multiply workaround.
                if (!isDWMultiply && !region->isScalar() &&
                    (byteSize != 8 && (byteSize * region->horzStride) < 8))
                {
                    // source is not 8 byte aligned
                    // this can happen e.g. for
                    // mov (8) r1.0<1>:df (mod)r3<8;8,1>:f
                    // which we'd need to change to
                    // mov (8) r10.0<2>:f (mod)r3.0<8;8,1>:f
                    // mov (8) r1.0<1>:df r10.0<8;4,2>:f
                    // to satisfy rule 1
                    uint8_t exSize = inst->getExecSize();
                    // multFactor is the stride (in elements) that spreads the
                    // temp's elements 8 bytes apart.
                    uint16_t multFactor = (uint16_t)(8 / byteSize);
                    G4_Type tmpType = srcAsRegion->getType();
                    if (multFactor == 8)
                    {
                        // byte type needs special handling since we can't have stride 8
                        // so the temp is widened to a word type with stride 4 instead
                        tmpType = (tmpType == Type_B) ? Type_W : Type_UW;
                        multFactor = 4;
                    }
                    // NOTE(review): multFactor was rewritten to 4 just above whenever
                    // it was 8, so this check can no longer fire.
                    MUST_BE_TRUE(multFactor != 8, "does not support 64b operation with byte source");
                    G4_Declare* tmp = builder.createTempVar(exSize * multFactor,
                        tmpType, GRFALIGN);
                    G4_DstRegRegion* tmpDst = builder.createDstRegRegion(tmp, multFactor);
                    // The copy reuses the original source operand (including its
                    // modifier) and the instruction's options.
                    G4_INST* movInst = builder.createMov(inst->getExecSize(), tmpDst, src, inst->getOption(), false);
                    bb->insertBefore(iter, movInst);
                    uint16_t width = exSize;
                    if (width * 8u > numEltPerGRF<Type_UB>())
                    {
                        // can't have width cross GRF
                        width = 4;
                    }
                    // New region on the temp: <multFactor*width; width, multFactor>
                    G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(tmp,
                        builder.createRegionDesc((uint16_t)multFactor * width, width, multFactor));
                    inst->setSrc(newSrc, i);
                }
                // Case 2: scalar sources are currently left untouched (the
                // alignment fix below is compiled out).
                else if (region->isScalar())
                {
#if 0
                    // scalar region still must be aligned to qword, though it can be any qword
                    if (byteSize < 8 && !builder.isOpndAligned(srcAsRegion, 8))
                    {
                        G4_Operand* tmpSrc = insertCopyBefore(iter, i, Four_Word, bb);
                        inst->setSrc(tmpSrc, i);
                    }
#endif
                }
                // Case 3: hasSameSubregOffset(inst) reported false.
                else if (!hasSameOffset)
                {
                    // we need a temp src that is GRF-aligned
                    if (byteSize == 8)
                    {
                        // the same src/dst offset restriction applies to move as well, so we have to generate
                        // a packed move with UD type to work around the restriction
                        // e.g., for
                        // add (2) ... r1.1<4;2,2>:q
                        // we turn it into
                        // mov (8) r10.0<1>:ud r1.2<1;1,0>:ud {NoMask}
                        // add (2) ... r10.0<4;2,2>:q
                        int numDwords = (src->getRightBound() - src->getLeftBound() + 1) / TypeSize(Type_UD);
                        numDwords = Round_Up_Pow2(numDwords);
                        G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), GRFALIGN);
                        copyDwords(tmpSrc, 0, src->getTopDcl(), src->getLeftBound(), numDwords, bb, iter);
                        // Same region and modifier as before, but based at offset 0 of the temp.
                        G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
                            Direct, tmpSrc->getRegVar(), 0, 0, srcAsRegion->getRegion(), tmpSrc->getElemType());
                        inst->setSrc(tmpSrcOpnd, i);
                    }
                    else
                    {
                        // use the good ol' insertMovBefore
                        // and force the resulting temp to be GRF-aligned
                        G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
                        G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
                        tmpSrcDcl->setSubRegAlign(GRFALIGN);
                        inst->setSrc(tmpSrc, i);
                    }
                }
                    }
                }

        // Final source pass: enforce VS == W * HS on every non-VxH source region.
        for (int i = 0; i < numSrc; i++)
        {
            // rewrite <1;1,0> to <2;2,1> since HW does not like the former
            G4_Operand* src = inst->getSrc(i);
            if (src != nullptr && src->isSrcRegRegion())
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                if (!region->isRegionWH() && region->vertStride != region->horzStride * region->width)
                {
                    // see if we can fix the region to satisfy VS = W * HS
                    if (region->width == inst->getExecSize())
                    {
                        // vs is a don't care, change to <w*hs, w, hs>
                        srcAsRegion->setRegion(builder.createRegionDesc(region->width * region->horzStride, region->width, region->horzStride));
                    }
                    else if (region->width == 1)
                    {
                        // hs is a don't care, change it to <esize*vs, esize, vs>
                        MUST_BE_TRUE(region->vertStride <= 4, "illegal vertical stride");

                        uint16_t wd = inst->getExecSize();
                        uint16_t hs = region->vertStride;
                        if (src->crossGRF())
                        {
                            // Make sure the new hs does not cross GRF
                            // by limiting the width to what fits in the first GRF
                            uint32_t nbytesIn1stGRF = numEltPerGRF<Type_UB>() - (src->getLeftBound() % numEltPerGRF<Type_UB>());
                            uint32_t eltBytes = srcAsRegion->getTypeSize();
                            uint32_t neltsIn1stGRF = nbytesIn1stGRF / eltBytes;

                            MUST_BE_TRUE((nbytesIn1stGRF % eltBytes) == 0, "Bad region with element crossing GRF");
                            MUST_BE_TRUE((neltsIn1stGRF % hs) == 0, "hs cannot cross GRF");

                            wd = neltsIn1stGRF / hs;
                            // Get the largest powOfTwo that can divide wd
                            // (wd & -wd keeps only the lowest set bit of wd)
                            wd = wd & (-wd);
                            //MUST_BE_TRUE(wd > 1, "Cannot select non-1 width w/o crossing GRF");
                        }
                        srcAsRegion->setRegion(builder.createRegionDesc(wd * hs, wd, hs));
                    }

                    else
                    {
                        // FIXME: Both VS and HS are used by the region, so we have to either split inst or insert multiple moves to pack the source
                        // both are painful, so we assert for now and fix later if we encounter such a case
                        MUST_BE_TRUE(false, "Unhandled bad 64b region on CHV/BXT");
                    }

                }
            }
        }
        // Destination fix-up: a temp dst is needed when the dst is not direct,
        // is an ARF, or (for execSize > 1) the sub-register offsets do not match.
        G4_DstRegRegion* dst = inst->getDst();
        if (dst != NULL && !dst->isNullReg())
        {
            bool needsTmpDst = dst->getRegAccess() != Direct ||
                (execSize > 1 && !hasSameOffset) ||
                dst->isAreg();
            if (needsTmpDst)
            {
                // we need to have a temp dst that is direct and GRF-aligned
                if (dst->getRegAccess() == Direct && dst->getTypeSize() == 8)
                {
                    // the same src/dst offset restriction applies to move as well, so we have to generate
                    // a move with UD type to work around the restriction
                    // e.g., for
                    // add (2) r1.2<1>:q ...
                    // we generate
                    // add (2) r3.0<1>:q ...
                    // mov (4) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
                    // If dst is not contiguous, we additionally add a move to pre-load the old values:
                    // add (2) r1.2<2>:q ...
                    // becomes
                    // mov (8) r3.0<1>:ud r1.4<1;1,0>:ud {NoMask}
                    // add (2) r3.0<2>:q ...
                    // mov (8) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
                    int numDwords = (dst->getRightBound() - dst->getLeftBound() + 1) / TypeSize(Type_UD);
                    numDwords = Round_Up_Pow2(numDwords);
                    G4_Declare* tmpDst = builder.createTempVar(numDwords / 2, dst->getType(), GRFALIGN);
                    if (numDwords > execSize * 2)
                    {
                        // dst is not packed, need a move to pre-load the dst value into tmp
                        copyDwords(tmpDst, 0, dst->getTopDcl(), dst->getLeftBound(), numDwords, bb, iter);
                    }
                    // The copy-back is inserted before the instruction that follows inst.
                    INST_LIST_ITER next = iter;
                    ++next;
                    copyDwords(dst->getTopDcl(), dst->getLeftBound(), tmpDst, 0, numDwords, bb, next);
                    inst->setDest(builder.createDstRegRegion(tmpDst, dst->getHorzStride()));
                }
                else
                {
                    // use the good ol' insertMoveAfter
                    // and force the resulting temp to be GRF-aligned
                    G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
                    G4_Declare* tmpDstDcl = tmpDst->getTopDcl();
                    tmpDstDcl->setSubRegAlign(GRFALIGN);
                    inst->setDest(tmpDst);
                    if (dst->getTypeSize() == 8)
                    {
                        // the mov created by insertMovAfter still writes the original
                        // (indirect or ARF) 64-bit dst and thus does not conform yet
                        // we rewrite
                        // mov (e) r[a0.0]<1>:q src<1;1,0>:q
                        // into
                        // mov (e*2) r[a0.0]<1>:ud src<1;1,0>:ud {NoMask}
                        // iter is a by-value parameter, so advancing it to the
                        // inserted mov does not affect the caller.
                        ++iter;
                        G4_INST* movInst = *iter;
                        MUST_BE_TRUE(movInst->opcode() == G4_mov && movInst->getDst() == dst &&
                            movInst->getSrc(0)->isSrcRegRegion(),
                            "unexpected instruction created by insertMovAfter");
                        MUST_BE_TRUE(dst->getHorzStride() == 1, "only stride 1 is supported for now");
                        dst->setType(Type_UD);
                        G4_SrcRegRegion* src = movInst->getSrc(0)->asSrcRegRegion();
                        // UD-typed alias of the temp with twice as many elements, so
                        // the mov can read it as dwords.
                        G4_Declare* tmpAsUD = builder.createTempVar(tmpDstDcl->getNumElems() * 2, Type_UD, Any);
                        tmpAsUD->setAliasDeclare(tmpDstDcl, 0);
                        // A scalar source becomes <0;2,1> (the same two dwords for
                        // every element); anything else becomes a stride-1 region.
                        const RegionDesc* newRegion = src->getRegion()->isScalar() ?
                            builder.createRegionDesc(0, 2, 1) : builder.getRegionStride1();
                        G4_SrcRegRegion* srcAsUD = builder.createSrcRegRegion(src->getModifier(),
                            src->getRegAccess(), tmpAsUD->getRegVar(), src->getRegOff(),
                            src->getSubRegOff() * 2, newRegion, tmpAsUD->getElemType());
                        movInst->setSrc(srcAsUD, 0);
                        movInst->setExecSize(G4_ExecSize(inst->getExecSize() * 2u));

                        // NoMask is set on the mov instruction, but if we fall outside of the new execution size,
                        // it won't be executed fully
                        // e.g., we have to change
                        // (W) mov (16|M24) r[a0.0,64]<1>:ud r67.0<8;8,1>:ud
                        // into
                        // (W) mov (16|M0) r[a0.0,64]<1>:ud r67.0<8;8,1>:ud
                        movInst->setMaskOption(InstOpt_M0);


                        // move saturate/pred to the original inst
                        // (the copy-back mov itself becomes an unpredicated,
                        // non-saturating NoMask move)
                        movInst->setOptionOn(InstOpt_WriteEnable);
                        if (movInst->getSaturate())
                        {
                            movInst->setSaturate(g4::NOSAT);
                            inst->setSaturate(g4::SAT);
                        }
                        G4_Predicate* pred = movInst->getPredicate();
                        if (pred)
                        {
                            MUST_BE_TRUE(inst->getPredicate() == nullptr, "both inst and movInst have predicates");
                            movInst->setPredicate(nullptr);
                            inst->setPredicate(pred);
                        }
                    }
                }
            }
        }
            }
            // Every non-emulation path reports false, even if inst was modified above.
            return false;
        }
3657 
3658 //------------------------------------------------------------------------------
3659 //
3660 //  For BDW, 32 bits integer multiply is implemented as the following macro
3661 //
3662 //  mul (8) acc0:d     r2.0<8;8,1>d   r3.0<16;8,2>:uw
3663 //  mach (8) rTemp<1>:d r2.0<8;8,1>d   r3.0<8;8,1>:d
3664 //  mov (8) r5.0<1>:d   rTemp:d // hi-32bits
3665 //  mov (8) r6.0<1>:d acc0:d // lo-32bits
3666 //
//  Note that this only changes the mul instruction's src1; mach and mov are generated elsewhere
3668 //------------------------------------------------------------------------------
fixMulSrc1(INST_LIST_ITER i,G4_BB * bb)3669 void HWConformity::fixMulSrc1(INST_LIST_ITER i, G4_BB* bb)
3670 {
3671     G4_INST* inst = *i;
3672     G4_Operand* src1 = inst->getSrc(1);
3673 
3674     if (!IS_DTYPE(src1->getType()))
3675     {
3676         // this could happen if dst is Q
3677         return;
3678     }
3679 
3680     if (src1->isImm())
3681     {
3682         uint64_t truncVal = src1->asImm()->getImm() & 0xFFFF;
3683         G4_Imm* new_src1 = builder.createImm(truncVal, Type_UW);
3684         inst->setSrc(new_src1, 1);
3685     }
3686     else
3687     {
3688         assert(src1->isSrcRegRegion() && "region expected");
3689         G4_SrcRegRegion* srcRegion = src1->asSrcRegRegion();
3690         const RegionDesc* rd = srcRegion->getRegion();
3691 
3692         // create a new opnd with type UW
3693         unsigned short scale = TypeSize(Type_D) / TypeSize(Type_UW);
3694         unsigned short newHS = rd->horzStride * scale;
3695         unsigned short newVS = rd->vertStride * scale;
3696         const RegionDesc* new_rd = builder.createRegionDesc(newVS, rd->width, newHS);
3697         short subRegOff = srcRegion->getSubRegOff();
3698         if (srcRegion->getRegAccess() == Direct)
3699         {
3700             subRegOff *= scale;
3701         }
3702         auto new_src1 = builder.createSrcRegRegion(
3703             srcRegion->getModifier(), srcRegion->getRegAccess(),
3704             srcRegion->getBase(), srcRegion->getRegOff(), subRegOff, new_rd,
3705             Type_UW);
3706         inst->setSrc(new_src1, 1);
3707         if (srcRegion->getRegAccess() != Direct)
3708         {
3709             new_src1->setImmAddrOff(srcRegion->getAddrImm());
3710         }
3711     }
3712 }
3713 
/*
 *  only acc0 may be used in DWord operations, so we have to break a
 *  SIMD16 DWord multiply into two mul-mach-mov sequences.
 *
 *  Input:
 *  (f0) mul (16) dst:d  src0:d  src1:d
 *
 *  Output:
 *  mul (8) acc0:d  src0:d  src1:d
 *  mach    (8) null:d  src0:d  src1:d
 *  (f0) mov (8) dst:d acc0:d
 *  mul (8) acc0:d  src0+1:d  src1+1:d
 *  mach    (8) null:d  src0+1:d    src1+1:d
 *  (f1) mov (8) dst+1:d acc0:d
 *
 *  Parameters:
 *  start, end -- in/out iterators delimiting the inclusive range
 *                [start, end] of instructions to split. Both are passed by
 *                reference and may be moved backwards by this function (see
 *                the start-- / end-- statements below) because the
 *                instruction they refer to is erased from its old position.
 *  bb         -- basic block that owns the instructions; new halves are
 *                inserted into it and the moved entries are erased from it.
 */
void HWConformity::splitDWMULInst(INST_LIST_ITER& start, INST_LIST_ITER& end, G4_BB* bb)
{
    // split simd16 inst into SIMD8 ones, since D is not supported for acc1
    // last_iter is advanced to one past 'end'; every second half is inserted
    // right before it, so the second halves end up grouped after the range.
    INST_LIST_ITER iter = start, last_iter = end;
    //iter--;
    last_iter++;
    INST_LIST_ITER curr_iter;
    // Process every instruction in [start, end); 'end' itself is handled
    // separately after the loop.
    while (iter != end)
    {
        curr_iter = iter;
        evenlySplitInst(curr_iter, bb);
        // curr_iter points to the second half after instruction splitting
        G4_INST* expand_sec_half_op = *curr_iter;
        // advance before curr_iter's list entry is erased below
        iter++;

        // Move the second half behind the range: re-insert the instruction
        // before last_iter, then erase its old list entry.
        bb->insertBefore(last_iter, expand_sec_half_op);
        if (curr_iter == start)
        {
            // 'start' refers to the entry about to be erased; step it back
            // so the caller's iterator stays valid.
            start--;
        }
        bb->erase(curr_iter);
    }
    // handle the last inst
    // NOTE(review): the loop above only exits when iter == end, so this
    // condition appears to always hold here.
    if (iter == end)
    {
        evenlySplitInst(iter, bb);
        G4_INST* expand_sec_half_op = *iter;
        bb->insertBefore(last_iter, expand_sec_half_op);
        // For the case that only one instruction needed to split, that is to say start equals to end
        if (start == end)
        {
            start--;
        }
        // 'end' refers to the same entry as 'iter', which is erased next;
        // step it back first.
        end--;
        bb->erase(iter);
    }
}
3767 
isGoodMadType(G4_Type type)3768 static bool isGoodMadType(G4_Type type)
3769 {
3770     switch (type)
3771     {
3772     case Type_F:
3773     case Type_HF:
3774     case Type_DF:
3775     case Type_BF:
3776         return true;
3777     default:
3778         return false;
3779     }
3780 }
3781 
isGoodAlign1TernaryDst(G4_INST * inst) const3782 bool HWConformity::isGoodAlign1TernaryDst(G4_INST* inst) const
3783 {
3784     // Align1 MAD requirements:
3785     // -- dst must be direct GRF/acc with horizontal stride 1 or 2
3786     G4_Type execType = inst->getExecType();
3787     G4_DstRegRegion* dst = inst->getDst();
3788 
3789     MUST_BE_TRUE(!IS_QTYPE(dst->getType()) && !IS_BTYPE(dst->getType()), "3Src inst don't support Q and B dst types");
3790 
3791     if (!builder.hasMixMode() &&
3792         isLowPrecisionFloatTy(dst->getType()) && !isLowPrecisionFloatTy(execType))
3793     {
3794         return false;
3795     }
3796 
3797     auto dstTySize = dst->getTypeSize();
3798 
3799     int alignInBytes = std::max((int) dstTySize, builder.get3SrcDstAlign());
3800 
3801     if (builder.noSrc2Regioning())
3802     {
3803         // src2 is required to have the same subreg as dst if src2 is not a scalar
3804         // If we can't guarantee this we have to align both of them to GRF
3805         unsigned src2Pos = inst->opcode() == G4_pseudo_mad ? 0 : 2;
3806         auto src2 = inst->getSrc(src2Pos);
3807         if (src2->isSrcRegRegion() && !src2->asSrcRegRegion()->isScalar())
3808         {
3809             alignInBytes = getGRFSize();
3810         }
3811     }
3812 
3813     if (!builder.isOpndAligned(dst, alignInBytes))
3814     {
3815         // dst may have special alignment due to encoding issues
3816         return false;
3817     }
3818 
3819     uint32_t effectiveStride = dst->getHorzStride();
3820     if (dstTySize < TypeSize(execType))
3821     {
3822         if (IS_TYPE_INT(dst->getType()))
3823         {
3824             effectiveStride *= TypeSize(execType) / dstTySize;
3825         }
3826         else
3827         {
3828             // we have mixed HF and F inst
3829             // dst can be packed HF, but then it must be oword aligned
3830             // this should be checked later for mixed mode inst
3831         }
3832     }
3833 
3834     return dst->getRegAccess() == Direct && effectiveStride <= 2;
3835 }
3836 
//
// check for legal align1 ternary inst sources
//
// inst     -- the ternary instruction (G4_pseudo_mad gets extra checks)
// srcPos   -- index of the source to examine; asserted to be 0, 1 or 2
// canBeImm -- whether an immediate is still acceptable for this source; it
//             is only consulted when the source actually is an immediate
//
// Returns true when the source at srcPos can be used as-is, false when the
// caller has to fix it up (or give up on the ternary form).
// Side effect: a contiguous source region whose width is not 1 is rewritten
// to the builder's stride-1 region (see checkSingleStrideRegion below).
//
bool HWConformity::isGoodAlign1TernarySrc(G4_INST* inst, int srcPos, bool canBeImm)
{
    MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");

    uint8_t execSize = inst->getExecSize();
    G4_Operand* src = inst->getSrc(srcPos);
    // for pseudo_mad we have to swap src0 and src2
    // (i.e. slot 0 of a pseudo_mad is what becomes GEN src2)
    bool isSrc2 = inst->opcode() == G4_pseudo_mad ? srcPos == 0 : srcPos == 2;

    // Without mix mode a low-precision float source is only legal when the
    // execution type is low-precision as well.
    if (!builder.hasMixMode())
    {
        G4_Type execType = inst->getExecType();
        if (isLowPrecisionFloatTy(src->getType()) && !isLowPrecisionFloatTy(execType))
        {
            return false;
        }
    }

    // Q-type sources are rejected unconditionally.
    if (IS_QTYPE(src->getType()))
    {
        return false;
    }

    // mad specific checks
    if (inst->opcode() == G4_pseudo_mad)
    {
        if (isSrc2)
        {
            // src2 may not be a D type
            if (IS_DTYPE(src->getType()))
            {
                return false;
            }

            // on platforms without src2 regioning, B-type src2 is rejected
            if (builder.noSrc2Regioning() && IS_BTYPE(src->getType()))
            {
                return false;
            }
        }
        else if (srcPos == 1)
        {
            if (IS_DTYPE(src->getType()) && src->isSrcRegRegion() &&
                src->asSrcRegRegion()->getModifier() != Mod_src_undef)
            {
                // no source modifier for DW multiply
                return false;
            }
        }
    }

    if (src->isImm())
    {
        // either src0 or src2 can be 16b imm, but not both
        // permanent WA: simd16 inst can't have src0 imm.
        // Instead of splitting, we just add a move

        if (canBeImm && (srcPos == 0 || srcPos == 2) && src->getTypeSize() <= 2)
        {
            if (VISA_WA_CHECK(builder.getPWaTable(), WaNoSimd16TernarySrc0Imm))
            {
                // with the WA active, the immediate is accepted only when it
                // is not src2 and the instruction is not SIMD16
                return !isSrc2 && inst->getExecSize() != g4::SIMD16;
            }
            return true;
        }
        return false;
    }
    else if (src->isSrcRegRegion())
    {
        // indirect sources are never legal
        if (src->asSrcRegRegion()->getRegAccess() != Direct)
        {
            return false;
        }

        // Validates a region already known to be single-stride with the
        // given stride. Strides above 4 are rejected. A contiguous region is
        // normalized to the builder's stride-1 region, and when unit stride
        // cannot be encoded it must additionally be aligned to two elements.
        auto checkSingleStrideRegion = [](G4_SrcRegRegion* src, int stride, uint8_t execSize, IR_Builder& builder)
        {
            const RegionDesc* srcRegion = src->getRegion();

            if (stride > 4)
            {
                return false;
            }
            else if (srcRegion->isContiguous(execSize))
            {
                // Normalize the region if it is not.
                if (srcRegion->width != 1)
                {
                    src->setRegion(builder.getRegionStride1(), /*invariant*/ true);
                }
                if (!builder.encodeUnitStrideTernary())
                {
                    // we have to make sure width is not being used to cross GRF, as <1;1,0>
                    // is not a legal region for align1 ternary source (vs 1 not supported)
                    // mad doesn't support <1;1,0>, the width is at least 2
                    int minAlignment = src->getTypeSize() * 2;
                    return builder.isOpndAligned(src, minAlignment);
                }
            }
            return true;
        };

        // the following regions are supported:
        // <N;N,0>
        // <0;1,0>
        // <W*H;W,H>
        const RegionDesc* srcRegion = src->asSrcRegRegion()->getRegion();
        if (srcRegion->isScalar())
        {
            return true;
        }

        // src0 and src1 (for pseudo-mad, it's src1 and src2) may use the <N;N,0> region
        // as they come with a vStride in encoding
        // TODO: we may consider swapping src1 and src2 to catch more regions
        if (!isSrc2)
        {
            uint16_t stride = 0;
            if (srcRegion->isSingleStride(execSize, stride))
            {
                return checkSingleStrideRegion(src->asSrcRegRegion(), stride, execSize, builder);
            }

            if (builder.encodeUnitStrideTernary())
            {
                // <4;4,0> and <8;8,0> are ok
                return srcRegion->vertStride == srcRegion->width &&
                    srcRegion->horzStride == 0 &&
                    (srcRegion->width == 4 || srcRegion->width == 8);
            }
            else
            {
                // <2;2,0>, <4;4,0> and <8;8,0> are ok
                return srcRegion->vertStride == srcRegion->width &&
                    srcRegion->horzStride == 0 &&
                    srcRegion->width <= 8;
            }
        }
        else
        {
            if (!builder.noSrc2Regioning())
            {
                // src2 (src0 for pseudo-mad) is without vstride, and its region must be
                // <esize*H;esize,H>, with vstride derived from exSize and hstride
                uint16_t stride = 0;
                if (srcRegion->isSingleStride(execSize, stride))
                {
                    return checkSingleStrideRegion(src->asSrcRegRegion(), stride, execSize, builder);
                }
            }
            else
            {
                // not a scalar, src2 must be GRF aligned.
                if (!builder.isOpndAligned(src, numEltPerGRF<Type_UB>()))
                {
                    return false;
                }

                uint16_t stride = 0;
                if (srcRegion->isSingleStride(execSize, stride))
                {
                    // NOTE: despite the names these two values are compared
                    // as per-element sizes: dst's execution type size versus
                    // src stride multiplied by src element size.
                    unsigned short dstExecSize = inst->getDst()->getExecTypeSize();
                    unsigned short srcExecSize = stride * src->asSrcRegRegion()->getElemSize();
                    // Source 2 and destination stride must be aligned to the same execution type.
                    // E.g. mad (4) r10.0<1>:hf src0 src1 r13.0<1>:hf
                    //      mad (4) r10.0<2>:hf src0 src1 r13.0<1>:f
                    //      mad (4) r10.0<1>:f  src0 src1 r13.0<2>:hf
                    // this rule is relaxed if mix mode is enabled (packed HF ok)
                    if (dstExecSize == srcExecSize)
                    {
                        return true;
                    }
                    if (builder.hasPartialMixMode() && inst->isMixedMode())
                    {
                        return true;
                    }
                }
            }

            // src2 region that matched none of the accepted forms above
            return false;
        }
    }

    // operands that are neither immediates nor source regions are accepted
    return true;
}
4022 
4023 //
4024 // a source is good for align16 if:
4025 // -- it is a direct srcRegRegion
4026 // -- it has contiguous region and can be made either GRF-aligned (for exec size >= 8)
4027 //    or oword aligned (for exec size == 4)
4028 // -- or it has scalar region and is not non-simd1 double
isGoodAlign16Src(G4_INST * inst,int srcPos)4029 bool HWConformity::isGoodAlign16Src(G4_INST* inst, int srcPos)
4030 {
4031     MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");
4032 
4033     uint8_t execSize = inst->getExecSize();
4034     G4_Operand* src = inst->getSrc(srcPos);
4035     G4_Type opnd_type = src->getType();
4036 
4037     // Constants are not allowed as MAD opnds.
4038     if (src->isSrcRegRegion())
4039     {
4040         const RegionDesc* region = src->asSrcRegRegion()->getRegion();
4041         G4_RegAccess regAcc = src->asSrcRegRegion()->getRegAccess();
4042 
4043         if (regAcc != Direct)
4044         {
4045             return false;
4046         }
4047 
4048         if (region->isContiguous(execSize))
4049         {
4050             if (builder.getPlatform() == GENX_BDW && TypeSize(opnd_type) < 4)
4051             {
4052                 // BDW HF has to be 32-byte aligned
4053                 if (!builder.isOpndAligned(src, 32))
4054                 {
4055                     return false;
4056                 }
4057             }
4058             else
4059             {
4060                 if (execSize >= 8)
4061                 {
4062                     // operand must be GRF aligned, or oword aligned for HF/W
4063                     uint32_t align = std::min<uint32_t>(execSize * src->getTypeSize(), 32);
4064                     if (!builder.isOpndAligned(src, align))
4065                     {
4066                         return false;
4067                     }
4068                 }
4069                 else if (execSize == 4 || execSize == 2)
4070                 {
4071                     // operand must be oword-aligned
4072                     if (!builder.isOpndAligned(src, 16))
4073                     {
4074                         return false;
4075                     }
4076                 }
4077             }
4078         }
4079         else if (src->asSrcRegRegion()->isScalar())
4080         {
4081             if (opnd_type == Type_DF && execSize != 1)
4082             {
4083                 // scalar region is illegal for DF since replicate is not supported
4084                 return false;
4085             }
4086 
4087             if (opnd_type == Type_HF && builder.getPlatform() == GENX_BDW)
4088             {
4089                 return false;
4090             }
4091         }
4092         else
4093         {
4094             // all other regions are illegal
4095             return false;
4096         }
4097 
4098         return true;
4099     }
4100     else
4101     {
4102         return false;
4103     }
4104 
4105 }
4106 
4107 //
4108 // Move modifiers of src2 in pseudo_mad to its defining instruction.
4109 //
4110 // mul (16) V66(0,0)<1>:d V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
4111 // psuedo_mad (16) V67(0,0)<1>:d V469,0)<8;8,1>:w 0x1b5d:w -V66(0,0)<16;16,1>:d
4112 //
4113 // becomes
4114 //
4115 // mul (16) V66(0,0)<1>:d -V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
4116 // psuedo_mad (16) V67(0,0)<1>:d V469,0)<8;8,1>:w 0x1b5d:w V66(0,0)<16;16,1>:d
4117 //
tryTransferSrcModifier(IR_Builder & builder,G4_INST * def,G4_Operand * src)4118 static void tryTransferSrcModifier(IR_Builder& builder, G4_INST* def,
4119     G4_Operand* src)
4120 {
4121     // Only when def has no other users.
4122     if (!def->hasOneUse())
4123         return;
4124 
4125     // Only transfer for integer types.
4126     if (!IS_SIGNED_INT(src->getType()))
4127         return;
4128 
4129     // In case the use type is different from the def type.
4130     if (!def->getDst() || (def->getDst()->getType() != src->getType()))
4131         return;
4132 
4133     switch (def->opcode()) {
4134     default:
4135         break;
4136 
4137         // Probably this is the only interesting op, since G4_math will not be
4138         // used to generate mac.
4139     case G4_mul:
4140     {
4141         // Chances are src1 is an immediate.
4142         G4_Operand* defSrc1 = def->getSrc(1);
4143         if (!IS_SIGNED_INT(defSrc1->getType()))
4144             return;
4145 
4146         if (defSrc1->isImm())
4147         {
4148             G4_Imm* val = defSrc1->asImm();
4149             // Mod_Minus is assumed.
4150             G4_Imm* newVal = builder.createImm(-val->getInt(), val->getType());
4151             def->setSrc(newVal, 1);
4152             src->asSrcRegRegion()->setModifier(Mod_src_undef);
4153         }
4154         else if (defSrc1->isSrcRegRegion())
4155         {
4156             G4_SrcRegRegion* reg = defSrc1->asSrcRegRegion();
4157             if (reg->getModifier() == Mod_src_undef)
4158             {
4159                 reg->setModifier(src->asSrcRegRegion()->getModifier());
4160                 src->asSrcRegRegion()->setModifier(Mod_src_undef);
4161             }
4162             else if (reg->getModifier() == Mod_Minus)
4163             {
4164                 reg->setModifier(Mod_src_undef);
4165                 src->asSrcRegRegion()->setModifier(Mod_src_undef);
4166             }
4167         }
4168     } break;
4169     }
4170 }
4171 
4172 // Try to move source modifiers on MAD's src2 into its defintion. This allows
4173 // pseudo_mad ops to be translated into mac ops.
tryEliminateMadSrcModifier(IR_Builder & builder,G4_INST * inst)4174 void HWConformity::tryEliminateMadSrcModifier(IR_Builder& builder, G4_INST* inst)
4175 {
4176     ASSERT_USER(inst->opcode() == G4_pseudo_mad, "not a speudo-mad");
4177 
4178     // For pseudo_mad, src2 is the major source operand to be examined later.
4179     // If there is no modifier on src2, then nothing to do.
4180     G4_Operand* src2 = inst->getSrc(2);
4181     if (!src2->isSrcRegRegion())
4182         return;
4183 
4184     // Currently, only handle modifier minus. To handle others, we may need
4185     // to insert extra instructions.
4186     if (src2->asSrcRegRegion()->getModifier() != Mod_Minus)
4187         return;
4188 
4189     // Only when src2 has a single definition.
4190     if (G4_INST* def = inst->getSingleDef(Opnd_src2, true))
4191     {
4192         tryTransferSrcModifier(builder, def, src2);
4193     }
4194 }
4195 
4196 /// Heuristic to decide whether this fp pseudo-mad should be lowered into a
4197 /// GEN mad or not. Returns true if mad is preferred, false otherwise.
4198 ///
4199 /// We flavor generating non-mad when this vISA mad is part of b2b mads that
4200 /// share the same dst.
4201 ///
isFpMadPreferred(G4_BB * bb,INST_LIST_ITER iter)4202 bool HWConformity::isFpMadPreferred(G4_BB* bb, INST_LIST_ITER iter)
4203 {
4204     G4_INST* inst = *iter;
4205     G4_Operand* dst = inst->getDst();
4206     MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
4207 
4208     // Check whether test_inst is sharing the same dst.
4209     auto equal_mad_dst = [](G4_INST* test_inst, G4_Operand* dst)
4210     {
4211         if (test_inst->opcode() == G4_pseudo_mad)
4212         {
4213             G4_Operand* test_dst = test_inst->getDst();
4214             if (test_dst->compareOperand(dst) == Rel_eq)
4215                 return true;
4216         }
4217         return false;
4218     };
4219 
4220     auto next_iter = std::next(iter);
4221     if (next_iter != bb->end())
4222     {
4223         G4_INST* next_inst = *next_iter;
4224         if (equal_mad_dst(next_inst, dst))
4225             return false;
4226     }
4227     if (iter != bb->begin())
4228     {
4229         auto prev_iter = std::prev(iter);
4230         G4_INST* prev_inst = *prev_iter;
4231         if (equal_mad_dst(prev_inst, dst))
4232             return false;
4233     }
4234 
4235     // FIXME: remove possile duplicate calls to isGoodAlign16Src, Cm only.
4236     // This will go away if we use an extra opcode to represent muladd.
4237     unsigned extraMov = 0;
4238     for (int k = 0; k < inst->getNumSrc(); k++)
4239     {
4240         if (!isGoodAlign16Src(inst, k))
4241         {
4242             // If need to insert >1 number of moves, then do not use mad.
4243             if (++extraMov > 1)
4244                 return false;
4245         }
4246     }
4247 
4248     return true;
4249 }
4250 
// generate align1 mad, inserting moves if necessary
// returns true if conversion is successful
// for floating point mad this must succeed due to precision requirements
//
// bb   -- basic block containing the instruction
// iter -- iterator to the pseudo_mad to convert
//
// On success the opcode becomes G4_mad and src0/src2 are exchanged. On
// failure the opcode is left as G4_pseudo_mad.
// NOTE(review): the src0/src1 swap performed at the top is not undone when
// the function later returns false.
bool HWConformity::generateAlign1Mad(G4_BB* bb, INST_LIST_ITER iter)
{

    G4_INST* inst = *iter;
    MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
    // a float dst means the mad form is mandatory: illegal operands are
    // fixed with moves instead of giving up
    bool mustDoMad = IS_TYPE_FLOAT_ALL(inst->getDst()->getType());


    // try swapping src0 (really src2) and src1 to see if we can save a move
    // some conditions where swap may help:
    // -- if src0 is D, as MAD only supports D + D * W
    // -- if src1 is imm, as MAD src2 supports 16-bit imm
    // -- if src0 is HF in a mix mode MAD, as MAD src1 supports HF
    // -- if src1 is scalar, as MAD src2 has more region restrictions
    // We perform the swapping before the dst checks as some platforms require dst and src2 to have the same subreg
    {
        G4_Operand* src0 = inst->getSrc(0);
        G4_Operand* src1 = inst->getSrc(1);
        if (IS_DTYPE(src0->getType()) && src0->isSrcRegRegion() && !IS_DTYPE(src1->getType()))
        {
            inst->swapSrc(0, 1);
        }
        else if (src1->isImm() && src1->getTypeSize() == 2)
        {
            //swap src0 and src1 as src0 supports imm
            inst->swapSrc(0, 1);
        }
        else if (src0->isSrcRegRegion() && !src0->asSrcRegRegion()->isScalar() &&
            src1->isSrcRegRegion() &&
            src1->asSrcRegRegion()->isScalar())
        {
            // Swap src0 and src1 if src1 is scalar but src0 is not, as src2 regioning support is quite limited.
            inst->swapSrc(0, 1);
        }
        else if (isLowPrecisionFloatTy(src0->getType()) && src1->getType() == Type_F)
        {
            inst->swapSrc(0, 1);
        }
    }

    if (!isGoodAlign1TernaryDst(inst))
    {
        if (mustDoMad)
        {
            // write into a temporary and copy to the real dst afterwards;
            // the temporary is GRF aligned when src2 regioning is unavailable
            auto alignment = builder.noSrc2Regioning() ? GRFALIGN : Four_Word;
            inst->setDest(insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb, alignment));
        }
        else
        {
            return false;
        }
    }

    // check src
    // Sources are visited from the highest index down to 0. canBeImm is
    // cleared once an immediate source has been accepted, so at most one
    // immediate is kept.
    bool canBeImm = true;
    for (int k = inst->getNumSrc() - 1; k >= 0; k--)
    {
        G4_Operand* src = inst->getSrc(k);
        if (!isGoodAlign1TernarySrc(inst, k, canBeImm))
        {
            if (mustDoMad)
            {
                // slot 0 of the pseudo_mad is what becomes GEN src2
                bool isSrc2 = (k == 0);
                if (builder.noSrc2Regioning() && isSrc2)
                {
                    fixSrc2(iter, bb, true);
                }
                else
                {
                    inst->setSrc(insertMovBefore(iter, k, src->getType(), bb), k);
                }
            }
            else
            {
                // Promote src2 from :b to :w to allow mad, for example:
                //     pseudo_mad (16) V211(0,0)<1>:d V210(0,0)<1;0>:d V106(0,0)<0;0>:b V81(0,0)<1;0>:d
                //  =>
                //     mov (1) TV74(0,0)<1>:w V106(0,0)<0;1,0>:b {Q1, Align1, NoMask}
                //     mad (16) V211(0,0)<1>:d V81(0,0)<1;0>:d V210(0,0)<1;0>:d TV74(0,0)<0;0>:w {H1, Align1}
                // Do not allow mad if both src1 and src2 are :b as it will generate mov+mov+mad. There is no benefit for
                // instruction count as mov+mov+mad equals to mov+mul+add. In some spilled cases the performance may be
                // even worse as more spill codes inserted.
                bool isSrc2 = (k == 0);
                if (builder.noSrc2Regioning() && isSrc2 && IS_BTYPE(src->getType()) && !IS_BTYPE(inst->getSrc(1)->getType()))
                {
                    bool hasModMinus = false;
                    if (src->isSrcRegRegion())
                    {
                        G4_SrcModifier mod = src->asSrcRegRegion()->getModifier();
                        hasModMinus = (mod == Mod_Minus || mod == Mod_Minus_Abs);
                    }

                    // If minus modifier is present, need signed type.
                    G4_Type type = (IS_SIGNED_INT(src->getType()) || hasModMinus) ? Type_W : Type_UW;
                    // choose the temp's stride so that its byte stride
                    // equals the dst's byte stride
                    auto dstStrideInBytes = inst->getDst()->getHorzStride() * TypeSize(inst->getDst()->getType());
                    uint16_t stride = (uint16_t)(dstStrideInBytes / TypeSize(type));
                    inst->setSrc(insertMovBefore(iter, k, type, bb, stride, GRFALIGN), k);
                }
                else
                {
                    return false;
                }
            }
        }
        else
        {
            if (src->isImm())
            {
                // an immediate has been accepted; later sources may not be one
                canBeImm = false;
            }
        }
    }

    inst->setOpcode(G4_mad);
    //swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
    inst->swapSrc(0, 2);

    return true;
}
4373 
// convert a FP (HF/F/DF) pseudo-mad into a GEN mad,
// inserting moves if necessary
// returns true if conversion is successful
// note that this must return true for IGC due to precision requirements
//
// bb   -- basic block containing the instruction
// iter -- iterator to the pseudo_mad to convert
//
// Moves are only inserted when mad is preferred (forced by option, favored by
// the builder, or chosen by isFpMadPreferred). On success the opcode becomes
// G4_mad and src0/src2 are exchanged.
// NOTE(review): when mad is preferred and a source has a type rejected by
// isGoodMadType, the function returns false after a dst move or earlier
// source moves may already have been inserted.
bool HWConformity::generateFPMad(G4_BB* bb, INST_LIST_ITER iter)
{
    G4_INST* inst = *iter;
    MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
    uint8_t execSize = inst->getExecSize();
    G4_DstRegRegion* dst = inst->getDst();

    // Align16 MAD requirements:
    // -- dst and all 3 srcs have the same F/HF/DF type (mixed F/HF is allowed on CHV+)
    // -- dst and all 3 srcs have direct access
    // -- execution size is 16/8/4/1
    // -- dst and src must be packed
    // -- if src region is not scalar, its subregister must be 16 byte aligned

    // do not force fma for CM since it doesn't have precision requirements
    bool preferFpMad = builder.getOption(vISA_forceFPMAD) || builder.favorFpMad();
    if (!preferFpMad)
    {
        preferFpMad = isFpMadPreferred(bb, iter);
    }

    // required dst alignment: execSize * element size, clamped to [16, 32]
    auto alignMent = execSize * dst->getTypeSize();
    alignMent = (alignMent > 32) ? 32 : alignMent;
    alignMent = (alignMent < 16) ? 16 : alignMent;

    if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
        !builder.isOpndAligned(dst, alignMent))
    {
        if (preferFpMad)
        {
            // compute into a temporary and copy to the original dst afterwards
            G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
            inst->setDest(tmpDst);
        }
        else
        {
            return false;
        }
    }

    // check src
    for (int k = 0; k < inst->getNumSrc(); k++)
    {
        G4_Type type = inst->getSrc(k)->getType();
        bool goodSrc = isGoodAlign16Src(inst, k);
        if (!goodSrc && preferFpMad)
        {
            // insert moves if type is legal mad type
            if (isGoodMadType(type))
            {
                G4_Operand* src = inst->getSrc(k);
                // DF source with a <2;2,0> region
                bool isReplicated = (type == Type_DF) &&
                    src->isSrcRegRegion() &&
                    (src->asSrcRegRegion()->getRegion()->width == 2) &&
                    (src->asSrcRegRegion()->getRegion()->horzStride == 0) &&
                    (src->asSrcRegRegion()->getRegion()->vertStride == 2);
                if ((type == Type_DF ||
                    (type == Type_HF && builder.getPlatform() == GENX_BDW)) &&
                    execSize > 1 &&
                    (src->isImm() || src->asSrcRegRegion()->isScalar()))
                {
                    // MAD DF does not support .r, so we have to broadcast the value
                    // '.r' on MAD HF on BDW is not a replication of that
                    // scalar element but a pair of half.
                    auto align = type == Type_HF ? GRFALIGN : Eight_Word;
                    broadcast(bb, iter, k, align);
                }
                // No need to insert mov for replicated DF src with <2;2,0> region,
                // which can be encoded as "xyxy" or "zwzw" swizzle based on offset
                else if (!isReplicated)
                {
                    inst->setSrc(insertMovBefore(iter, k, type, bb), k);
                }
                // the source has been fixed up (or needs no fix)
                goodSrc = true;
            }
        }
        if (!goodSrc)
        {
            return false;
        }
    }

    inst->setOpcode(G4_mad);

    //swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
    inst->swapSrc(0, 2);

    return true;
}
4466 
4467 // If the LF MAD does not conform to Genx ISA semantics, then translate
4468 // it into a valid GenX sequence - either an equivalent MUL/ADD sequence
4469 // or an equivalent MAC.
4470 // ASSUMPTION:
4471 //    This phase must be called at the end of all other optimizations
4472 //    phases and just prior to testing for ACC spilling.
fixMADInst(G4_BB * bb)4473 void HWConformity::fixMADInst(G4_BB* bb)
4474 {
4475     bool doAlign1Mad = builder.hasAlign1Ternary();
4476     bb->resetLocalIds();
4477     INST_LIST_ITER i = bb->begin();
4478 
4479     for (auto iterEnd = bb->end(); i != iterEnd; ++i)
4480     {
4481         G4_INST* inst = *i;
4482         if (inst->opcode() != G4_pseudo_mad)
4483         {
4484             continue;
4485         }
4486 
4487         tryEliminateMadSrcModifier(builder, inst);
4488 
4489         G4_DstRegRegion* dst = inst->getDst();
4490         uint32_t exec_size = inst->getExecSize();
4491 
4492         bool conforming_genx_mad = true;
4493 
4494         if (exec_size > G4_ExecSize(builder.getNativeExecSize() * 2))
4495         {
4496             conforming_genx_mad = false;
4497         }
4498         else
4499         {
4500             switch (dst->getType())
4501             {
4502             case Type_F:
4503             case Type_HF:
4504             case Type_DF:
4505             case Type_BF:
4506                 break;
4507             case Type_W:
4508             case Type_UW:
4509             case Type_D:
4510             case Type_UD:
4511                 if (!doAlign1Mad)
4512                 {
4513                     conforming_genx_mad = false;
4514                 }
4515                 break;
4516             default:
4517                 conforming_genx_mad = false;
4518             }
4519         }
4520 
4521         if (conforming_genx_mad)
4522         {
4523             bool doMad = doAlign1Mad ?
4524                 generateAlign1Mad(bb, i) : generateFPMad(bb, i);
4525             if (doMad)
4526             {
4527                 // done with this pseudo-mad
4528                 continue;
4529             }
4530         }
4531 
4532         // translate MAD into MUL/ADD
4533         convertMAD2MulAdd(i, bb);
4534         i++;    // skip the add
4535     }
4536 }
4537 
isAccCandidate(G4_INST * inst,Gen4_Operand_Number opndNum,G4_Kernel & kernel)4538 static bool isAccCandidate(G4_INST* inst, Gen4_Operand_Number opndNum, G4_Kernel& kernel)
4539 
4540 {
4541     if (!kernel.fg.builder->canMadHaveSrc0Acc())
4542     {
4543         return false;
4544     }
4545 
4546     switch (opndNum)
4547     {
4548     case Opnd_src0:
4549     case Opnd_src1:
4550         break;
4551     default:
4552         return false;
4553     }
4554 
4555     if (!inst->canSrcBeAcc(opndNum))
4556     {
4557         return false;
4558     }
4559 
4560     return true;
4561 }
4562 
4563 struct LiveNode
4564 {
4565     G4_INST* Inst;
4566     Gen4_Operand_Number OpNum;
LiveNodeLiveNode4567     LiveNode(G4_INST* Inst, Gen4_Operand_Number OpNum)
4568         : Inst(Inst)
4569         , OpNum(OpNum)
4570     {
4571     }
4572 };
4573 
// Use-count threshold for a global declare: localizeForAcc() compares the
// number of uses it recorded for a declare against this value
// (size() >= GLOBAL_USE_NUM).
#define GLOBAL_USE_NUM 15
4575 
isSameOperand(G4_Operand * srcOpnd,struct LiveNode * ln)4576 static bool isSameOperand(G4_Operand* srcOpnd, struct LiveNode* ln)
4577 {
4578     G4_Operand* opnd = ln->Inst->getOperand(ln->OpNum);
4579 
4580     if (opnd->compareOperand(srcOpnd) == Rel_eq)
4581     {
4582         return true;
4583     }
4584 
4585     return false;
4586 }
4587 
localizeForAcc(G4_BB * bb)4588 void HWConformity::localizeForAcc(G4_BB* bb)
4589 {
4590     std::map<const G4_Declare*, G4_Operand*> replacedOperand;
4591     std::unordered_map<const G4_Declare*, std::vector<struct LiveNode>> useNodes;
4592     std::vector<const G4_Declare*> erasedCandidates;
4593 
4594     curBB = bb;
4595 
4596     for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
4597     {
4598         G4_INST* inst = *instIter;
4599 
4600         //Not defined in current BB
4601         G4_Operand* dst = inst->getOperand(Opnd_dst);
4602         if (dst && dst->isGreg() && kernel.fg.globalOpndHT.isOpndGlobal(dst))
4603         {
4604             const G4_Declare* dcl = dst->getTopDcl();
4605             if (useNodes.find(dcl) != useNodes.end())
4606             {
4607                 useNodes.erase(dcl); //Maybe added again
4608                 erasedCandidates.emplace_back(dcl); //erased declares
4609             }
4610         }
4611 
4612         //Source operand
4613         for (auto OpNum :
4614             { Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
4615               Gen4_Operand_Number::Opnd_src2 })
4616         {
4617             G4_Operand* src = inst->getOperand(OpNum);
4618             if (src && src->isGreg() && kernel.fg.globalOpndHT.isOpndGlobal(src))
4619             {
4620                 const G4_Declare* dcl = src->getTopDcl();
4621                 if ((OpNum != Opnd_src0 &&  //Acc can be used only for src0 and src1
4622                     OpNum != Opnd_src1) ||
4623                     !isAccCandidate(inst, OpNum, kernel)) //The operand is can be replaced with ACC
4624                 {
4625                     auto dclIter = std::find(erasedCandidates.begin(), erasedCandidates.end(), dcl);
4626                     if (dclIter == erasedCandidates.end())
4627                     {
4628                         erasedCandidates.emplace_back(dcl);
4629                     }
4630                 }
4631                 else
4632                 {
4633                     if (useNodes[dcl].empty() ||
4634                         isSameOperand(src, &(useNodes[dcl][0])))
4635                     {
4636                         useNodes[dcl].emplace_back(inst, OpNum);
4637                     }
4638                 }
4639             }
4640         }
4641     }
4642 
4643     for (auto& Nodes : useNodes)
4644     {
4645         const G4_Declare* dcl = Nodes.first;
4646         auto dclIter = std::find(erasedCandidates.begin(), erasedCandidates.end(), dcl);
4647         if (dclIter != erasedCandidates.end())
4648         {
4649             //removed already
4650             continue;
4651         }
4652 
4653         if (Nodes.second.size() >= GLOBAL_USE_NUM)
4654         {
4655             for (auto& LN : Nodes.second)
4656             {
4657                 G4_INST* inst = LN.Inst;
4658                 Gen4_Operand_Number opNum = LN.OpNum;
4659                 int i = inst->getSrcNum(opNum);
4660                 G4_Operand* src = inst->getSrc(i);
4661                 G4_Operand* tmpOpnd = nullptr;
4662 
4663                 auto itR = replacedOperand.find(dcl);
4664                 if (itR != replacedOperand.end())
4665                 {
4666                     tmpOpnd = builder.duplicateOperand(itR->second);
4667                 }
4668                 else
4669                 {
4670                     tmpOpnd = insertCopyAtBBEntry(bb, inst->getExecSize(), src);
4671                     replacedOperand[dcl] = tmpOpnd;
4672                 }
4673                 inst->setSrc(tmpOpnd, i);
4674             }
4675         }
4676     }
4677 
4678     return;
4679 }
4680 
4681 // convert a psuedo mad inst into mul/add
4682 // return the iterator pointing to add
convertMAD2MulAdd(INST_LIST_ITER iter,G4_BB * bb)4683 void HWConformity::convertMAD2MulAdd(INST_LIST_ITER iter, G4_BB* bb)
4684 {
4685     G4_INST* inst = *iter;
4686     assert(inst->opcode() == G4_pseudo_mad && "expect pseudo-mad");
4687 
4688     G4_DstRegRegion* addOpDst = inst->getDst();
4689     G4_Operand* addOpnd2 = inst->getSrc(2);
4690     G4_Type mulOpDstType = addOpDst->getType();
4691     G4_Type mulOpExecType = inst->getExecType();
4692     // pick the widest type of mad's src and dst as the intermediate type
4693     if (TypeSize(mulOpDstType) > TypeSize(mulOpExecType))
4694     {
4695         mulOpExecType = mulOpDstType;
4696     }
4697 
4698     mulOpDstType = mulOpExecType;
4699 
4700     G4_SubReg_Align     subAlign = Get_G4_SubRegAlign_From_Type(mulOpDstType);
4701 
4702     // Reuse the MAD op for MUL.
4703     inst->setOpcode(G4_mul);
4704     inst->setSrc(nullptr, 2);
4705 
4706     G4_Declare* mulDefDcl = builder.createTempVar(inst->getExecSize(), mulOpDstType, subAlign);
4707 
4708     G4_DstRegRegion* mulOpDst = builder.createDstRegRegion(mulDefDcl, 1);
4709     inst->setDest(mulOpDst);
4710 
4711     // Follow with an ADD.
4712     INST_LIST_ITER tIter = iter;
4713     tIter++;
4714 
4715     auto addOpnd1 = builder.createSrcRegRegion(mulDefDcl, builder.getRegionStride1());
4716     G4_INST* addOp = builder.createInternalInst(
4717         inst->getPredicate(),
4718         G4_add,
4719         inst->getCondMod(),
4720         inst->getSaturate(),
4721         inst->getExecSize(),
4722         addOpDst,
4723         addOpnd1,
4724         addOpnd2,
4725         nullptr,
4726         inst->getOption());
4727 
4728     bb->insertBefore(tIter, addOp);
4729 
4730     // predicate/condmod/saturate, if they exist, are propagated to the add instruction
4731     inst->setSaturate(g4::NOSAT);
4732     inst->setPredicate(NULL);
4733     inst->setCondMod(nullptr);
4734 
4735     {
4736         inst->transferDef(addOp, Opnd_src2, Opnd_src1);
4737         if (addOp->getPredicate())
4738         {
4739             inst->transferDef(addOp, Opnd_pred, Opnd_pred);
4740         }
4741         inst->transferUse(addOp);
4742         inst->addDefUse(addOp, Opnd_src0);
4743     }
4744 }
4745 
4746 // See if we can convert the pseudo_sada2 instruction into an actual Gen sada2
4747 // This can be done if the following conditions are met:
4748 // -- We can find the definition of the pseudo sada2 instruction's source 2 in
4749 //    the same basic block, and that
4750 // -- it may be replaced by an acc (i.e., the src2 is its only use, the dst and
4751 //    the src have identical regions, and there are no intervening instructions
4752 //    that update acc)
4753 //
4754 // We additionally attempt to schedule up the sada2 instruction to be as close
4755 // as possible to the src2 defining instruction (subject to the constraints of
4756 // def-use chains for def, src0 and src1), so that more opportunites may be
4757 // exposed for later sada2 instructions
4758 
fixSADA2Inst(G4_BB * bb)4759 void HWConformity::fixSADA2Inst(G4_BB* bb)
4760 {
4761 
4762     INST_LIST_ITER i = bb->begin();
4763     while (i != bb->end())
4764     {
4765 
4766         G4_INST* inst = *i;
4767         if (inst->opcode() != G4_pseudo_sada2)
4768         {
4769             ++i;
4770             continue;
4771         }
4772 
4773         G4_Operand* src2 = inst->getSrc(2);
4774 
4775         bool canDoSada2 = true;
4776         G4_INST* src2Dst = NULL;
4777 
4778         int emask = inst->getMaskOption();
4779         if (!bb->isAllLaneActive() &&
4780             emask != InstOpt_WriteEnable &&
4781             inst->getMaskOffset() != 0)
4782         {
4783             canDoSada2 = false;
4784         }
4785 
4786         G4_DstRegRegion* dst = inst->getDst();
4787         if (canDoSada2)
4788         {
4789             if (src2->isSrcRegRegion() && src2->asSrcRegRegion()->getRegAccess() == Direct)
4790             {
4791                 // check Src2
4792                 if (kernel.fg.globalOpndHT.isOpndGlobal(src2))
4793                 {
4794                     // no sada2 if operand is global
4795                     canDoSada2 = false;
4796                 }
4797                 else if (src2->asSrcRegRegion()->getModifier() != Mod_src_undef)
4798                 {
4799                     // no sada2 if src2 has a modifier
4800                     canDoSada2 = false;
4801                 }
4802                 else
4803                 {
4804                     for (auto defIter = inst->def_begin(), end = inst->def_end(); defIter != end; ++defIter)
4805                     {
4806                         if ((*defIter).second == Opnd_src2)
4807                         {
4808                             if (src2Dst != NULL)
4809                             {
4810                                 // no sada2 if src2 has >1 definition
4811                                 canDoSada2 = false;
4812                                 break;
4813                             }
4814                             src2Dst = (*defIter).first;
4815                         }
4816                     }
4817 
4818                     if (!src2Dst)
4819                     {
4820                         canDoSada2 = false;
4821                     }
4822                     else
4823                     {
4824                         if (!src2Dst->hasOneUse())
4825                         {
4826                             // no sad2 if def has more than one use
4827                             canDoSada2 = false;
4828                         }
4829                         else
4830                         {
4831                             G4_DstRegRegion* src2DstOpnd = src2Dst->getDst();
4832                             G4_Type src2DstType = src2DstOpnd->getType();
4833                             if (src2DstOpnd->getRegAccess() != Direct
4834                                 || (src2DstType != Type_W && src2DstType != Type_UW))
4835                             {
4836                                 // no sada2 if def's dst is indirect, or it type is not W or UW
4837                                 canDoSada2 = false;
4838                             }
4839                             else if (src2DstOpnd->compareOperand(src2) !=
4840                                 Rel_eq)
4841                             {
4842                                 // no sada2 if src2Dst and src2 are not equal
4843                                 canDoSada2 = false;
4844                             }
4845                         }
4846                     }
4847                 }
4848             }
4849             else
4850             {
4851                 canDoSada2 = false;
4852             }
4853         }
4854 
4855         // The new location of the sada2 after the conversion
4856         INST_LIST_ITER newSada2Iter = i;
4857         --newSada2Iter;
4858         if (canDoSada2)
4859         {
4860             // try to schedule up the sada2 to be as close to the src2-defining instruction
4861             // as possible to expose more optmizaition opportunities
4862             for (; *newSada2Iter != src2Dst; --newSada2Iter)
4863             {
4864                 if (inst->isRAWdep(*newSada2Iter) ||
4865                     inst->isWAWdep(*newSada2Iter) ||
4866                     inst->isWARdep(*newSada2Iter))
4867                 {
4868                     break;
4869                 }
4870             }
4871 
4872             // make sure there are no instructions between the sada2's new location
4873             // and the src2-defining instruction that updates acc
4874             for (auto iter = newSada2Iter; *iter != src2Dst; --iter)
4875             {
4876                 G4_INST* aInst = *iter;
4877                 if (aInst->hasACCOpnd())
4878                 {
4879                     canDoSada2 = false;
4880                     break;
4881                 }
4882             }
4883         }
4884 
4885         if (canDoSada2)
4886         {
4887             // We have verified all conditions and can convert this instruction to sada2.
4888             // replace the destination for src2Dst to be acc0.
4889             // The actual acc0 offset will be fixed in a later pass
4890             G4_DstRegRegion* accDstOpnd = builder.createDst(
4891                 builder.phyregpool.getAcc0Reg(),
4892                 0,
4893                 0,
4894                 1,
4895                 src2->getType());
4896             src2Dst->setDest(accDstOpnd);
4897             if (src2Dst->getExecSize() == g4::SIMD1)
4898             {
4899                 // This can happen for the first sada2 instruction if src2 is scalar
4900                 // expand its execution size so that acc is fully defined
4901                 src2Dst->setExecSize(inst->getExecSize());
4902             }
4903 
4904             // create an implicit acc parameter for sada2
4905             inst->setOpcode(G4_sada2);
4906             inst->setSrc(nullptr, 2);
4907             G4_SrcRegRegion* accSrcOpnd = builder.createSrc(
4908                 builder.phyregpool.getAcc0Reg(),
4909                 0,
4910                 0,
4911                 builder.getRegionStride1(),
4912                 src2->getType());
4913 
4914             inst->setImplAccSrc(accSrcOpnd);
4915 
4916             ++newSada2Iter;
4917             bb->insertBefore(newSada2Iter, inst);
4918             i = bb->erase(i);
4919 
4920             // maintain def-use
4921 
4922             for (auto tmpIter = src2Dst->use_begin(), end = src2Dst->use_end(); tmpIter != end; ++tmpIter)
4923             {
4924                 if ((*tmpIter).first == inst && (*tmpIter).second == Opnd_src2)
4925                 {
4926                     (*tmpIter).second = Opnd_implAccSrc;
4927                     break;
4928                 }
4929             }
4930 
4931             for (auto tmpIter = inst->def_begin(), end = inst->def_end(); tmpIter != end; ++tmpIter)
4932             {
4933                 if ((*tmpIter).first == src2Dst && (*tmpIter).second == Opnd_src2)
4934                 {
4935                     (*tmpIter).second = Opnd_implAccSrc;
4936                     break;
4937                 }
4938             }
4939         }
4940         else
4941         {
4942             // pseudo_sada2 (N) dst src0 src1 src2
4943             // becomes
4944             // sad2 (n) tmp<1>:w src0 src1
4945             // add (n) dst tmp<n;n,1>:w src2
4946 
4947             inst->setOpcode(G4_sad2);
4948             inst->setSrc(nullptr, 2);
4949 
4950             G4_SubReg_Align sad2TmpSubAlign = Get_G4_SubRegAlign_From_Type(dst->getType());
4951 
4952             if ((unsigned)inst->getExecSize() * dst->getTypeSize() > numEltPerGRF<Type_UB>())
4953             {
4954                 // align to GRF
4955                 sad2TmpSubAlign = GRFALIGN;
4956             }
4957             // create a new temp variable as sad2's destination
4958             G4_Declare* sad2Tmp = builder.createTempVar(inst->getExecSize(), dst->getType(), sad2TmpSubAlign);
4959             G4_DstRegRegion* sad2Dst = builder.createDstRegRegion(sad2Tmp, 1);
4960             inst->setDest(sad2Dst);
4961 
4962             uint16_t srcVertStride, srcWidth, srcHorzStride;
4963             srcWidth = inst->getExecSize() > g4::SIMD8 ? g4::SIMD8 : inst->getExecSize();
4964             srcHorzStride = 1;
4965             srcVertStride = srcWidth;
4966 
4967             // opnd 0 for add is the new temp we've just created
4968             const RegionDesc* rd = builder.createRegionDesc(srcVertStride, srcWidth, srcHorzStride);
4969             G4_Operand* addSrc0Opnd = builder.createSrc(sad2Dst->getBase(),
4970                 0, 0, rd, sad2Dst->getType());
4971 
4972             // opnd 1 is src2 of the pseudo_sada2
4973             // dst is the same as the pseudo_sada2
4974             G4_INST* addInst = builder.createInternalInst(
4975                 inst->getPredicate(),
4976                 G4_add,
4977                 inst->getCondMod(),
4978                 inst->getSaturate(),
4979                 inst->getExecSize(),
4980                 dst,
4981                 addSrc0Opnd,
4982                 src2,
4983                 NULL,
4984                 inst->getOption());
4985 
4986             INST_LIST_ITER addLoc = i;
4987             ++addLoc;
4988             bb->insertBefore(addLoc, addInst);
4989 
4990             // FIXME: redundant?
4991             inst->addDefUse(addInst, Opnd_src0);
4992 
4993             // The sad2 op should not have the SAT attribute set,
4994             // as this is intended only for the final result of the
4995             // SADA2 (and thus the add op will keep the SAT attribute).
4996             inst->setSaturate(g4::NOSAT);
4997             inst->setPredicate(NULL);
4998 
4999             {
5000                 inst->transferDef(addInst, Opnd_src2, Opnd_src1);
5001                 if (addInst->getPredicate())
5002                 {
5003                     inst->transferDef(addInst, Opnd_pred, Opnd_pred);
5004                 }
5005                 inst->transferUse(addInst);
5006                 inst->addDefUse(addInst, Opnd_src0);
5007             }
5008             ++i;
5009         }
5010     }
5011 }
5012 
fixSendInst(G4_BB * bb)5013 void HWConformity::fixSendInst(G4_BB* bb)
5014 {
5015 
5016     for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
5017     {
5018 
5019         G4_INST* inst = *i;
5020         if (!inst->isSend())
5021         {
5022             continue;
5023         }
5024 
5025         if (inst->getExecSize() < builder.getNativeExecSize())
5026         {
5027             // A64 messages require a minimum msg len of two for address (src0), which is inconsistent
5028             // with our input IR as it allows <2 GRF address variables (e.g., simd1 A64 scatter r/w).
5029             // To avoid this causing overlap between send dst/src0/src1 (it is known to cause HW hang),
5030             // we have to ensure they are all 2GRF-aligned
5031             G4_Declare* src0Dcl = inst->getSrc(0)->getTopDcl();
5032             // ToDo: check if dst/src1 may also exhibit such size mismatch
5033             bool sizeMismatch = inst->getMsgDesc()->getSrc0LenRegs() == 2 &&
5034                 (src0Dcl && src0Dcl->getRootDeclare()->getByteSize() < 2u * numEltPerGRF<Type_UB>());
5035             auto doEvenAlign = [](G4_Declare* dcl)
5036             {
5037                 if (dcl)
5038                 {
5039                     dcl = dcl->getRootDeclare();
5040                     // variables >= 2 GRF don't need even alignment since they can't possibly overlap
5041                     if (dcl->getByteSize() < 2u * numEltPerGRF<Type_UB>())
5042                     {
5043                         dcl->setEvenAlign();
5044                     }
5045                 }
5046             };
5047             if (sizeMismatch)
5048             {
5049                 doEvenAlign(inst->getSrc(0)->getTopDcl());
5050                 if (inst->isSplitSend())
5051                 {
5052                     doEvenAlign(inst->getSrc(1)->getTopDcl());
5053                 }
5054                 if (builder.WaDisableSendSrcDstOverlap())
5055                 {
5056                     doEvenAlign(inst->getDst()->getTopDcl());
5057                 }
5058             }
5059         }
5060 
5061         uint16_t offset = 0;
5062         if (!builder.isOpndAligned(inst->getDst(), offset, numEltPerGRF<Type_UB>()))
5063         {
5064             replaceDst(i, inst->getDst()->getType(), GRFALIGN);
5065         }
5066 
5067         G4_Operand* src0 = inst->getSrc(0);
5068         G4_Declare* src0TopDcl = src0->getTopDcl();
5069 
5070         // if src0 and src1 are hard-wired GRF, check that
5071         // they satisfy EOT and preemption restrictions
5072         auto needsTempSrc = [this](G4_INST* inst, G4_Declare* dcl)
5073         {
5074             return dcl->getRegVar() && dcl->getRegVar()->getPhyReg() &&
5075                 ((inst->isEOT() && builder.hasEOTGRFBinding() &&
5076                     dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 112) ||
5077                     (builder.getOption(vISA_enablePreemption) &&
5078                         dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 2));
5079         };
5080 
5081         auto fixSrc = [&](G4_INST* inst, bool isSrc0)
5082         {
5083             auto sendSrc = isSrc0 ? inst->getSrc(0)->asSrcRegRegion() : inst->getSrc(1)->asSrcRegRegion();
5084             uint16_t rows = isSrc0 ? inst->getMsgDesc()->getSrc0LenRegs() : inst->getMsgDesc()->getSrc1LenRegs();
5085             G4_Type type = sendSrc->getType();
5086             G4_Declare* dcl = builder.createTempVar(rows * builder.getNativeExecSize(), type, GRFALIGN);
5087 
5088             MUST_BE_TRUE(TypeSize(type) == 4, "Invalid src opnd type for send.");
5089 
5090             const RegionDesc* region = builder.getRegionStride1();
5091             G4_VarBase* base = sendSrc->getBase();
5092             short baseOff = sendSrc->getRegOff();
5093             short baseSubOff = sendSrc->getSubRegOff();
5094             for (uint16_t idx = 0; idx != rows; ++idx) {
5095                 G4_SrcRegRegion* src = builder.createSrc(base, baseOff + idx, baseSubOff + 0, region, type);
5096                 G4_DstRegRegion* dst = builder.createDst(dcl->getRegVar(), idx, 0, 1, type);
5097                 G4_INST* newInst = builder.createMov(builder.getNativeExecSize(), dst, src, InstOpt_WriteEnable, false);
5098                 bb->insertBefore(i, newInst);
5099             }
5100 
5101             G4_Operand* newSrc = builder.createSrcRegRegion(dcl, builder.getRegionStride1());
5102             inst->setSrc(newSrc, isSrc0 ? 0 : 1);
5103         };
5104 
5105         if (needsTempSrc(inst, src0TopDcl))
5106         {
5107             fixSrc(inst, true);
5108         }
5109 
5110         if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg())
5111         {
5112             // src1 may be null because some messages (e.g., CPS) require split send
5113             if (!builder.isOpndAligned(inst->getSrc(1), numEltPerGRF<Type_UB>()))
5114             {
5115                 inst->setSrc(insertMovBefore(i, 1, inst->getSrc(1)->getType(), bb, GRFALIGN), 1);
5116             }
5117             G4_Operand* src1 = inst->getSrc(1);
5118             G4_Declare* src1TopDcl = src1->getTopDcl();
5119 
5120             if (needsTempSrc(inst, src1TopDcl))
5121             {
5122                 fixSrc(inst, false);
5123             }
5124         }
5125 
5126         if (builder.getOption(vISA_enablePreemption))
5127         {
5128             G4_DstRegRegion* dst = inst->getDst();
5129             if (!dst->isNullReg())
5130             {
5131                 G4_Declare* dstTopDcl = dst->getTopDcl();
5132                 if (dstTopDcl != NULL &&
5133                     dstTopDcl->getRegVar() &&
5134                     dstTopDcl->getRegVar()->getPhyReg())
5135                 {
5136                     MUST_BE_TRUE((dstTopDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() > 2), "Unexpected preg used for send destination.");
5137                 }
5138             }
5139         }
5140 
5141         if (builder.WaDisableSendSrcDstOverlap())
5142         {
5143             // create copy if dst and src0/src1 overlap due to being the same variable
5144             bool src0Overlap = inst->getDst()->compareOperand(inst->getSrc(0)) != Rel_disjoint;
5145             bool src1Overlap = inst->isSplitSend() && inst->getDst()->compareOperand(inst->getSrc(1)) != Rel_disjoint;
5146             if (src0Overlap || src1Overlap)
5147             {
5148                 int dstSize = inst->getMsgDesc()->getDstLenRegs();
5149                 int src0Size = src0Overlap ? inst->getMsgDesc()->getSrc0LenRegs() : 0;
5150                 int src1Size = src1Overlap ? inst->getMsgDesc()->getSrc1LenRegs() : 0;
5151                 if (inst->getPredicate() || (bb->isDivergent() && !inst->isWriteEnableInst()) || dstSize > src0Size + src1Size)
5152                 {
5153                     //copy src0/src1 if inst does not update all channels
5154                     //ToDo: the copies may be OOB if src0/src1 are scalar. It should be ok since we don't care about the values,
5155                     //but IR verifier might complain about OOB.
5156                     if (src0Overlap)
5157                     {
5158                         G4_Declare* copyDst = builder.createTempVar(src0Size * numEltPerGRF<Type_UD>(), Type_UD, Any);
5159                         copyRegs(copyDst, 0, inst->getSrc(0)->getBase()->asRegVar()->getDeclare(),
5160                             inst->getSrc(0)->asSrcRegRegion()->getRegOff() * getGRFSize(), src0Size, bb, i);
5161                         inst->setSrc(builder.createSrcRegRegion(copyDst, builder.getRegionStride1()), 0);
5162                     }
5163                     if (src1Overlap)
5164                     {
5165                         G4_Declare* copyDst = builder.createTempVar(src1Size * numEltPerGRF<Type_UD>(), Type_UD, Any);
5166                         copyRegs(copyDst, 0, inst->getSrc(1)->getBase()->asRegVar()->getDeclare(),
5167                             inst->getSrc(1)->asSrcRegRegion()->getRegOff() * getGRFSize(), src1Size, bb, i);
5168                         inst->setSrc(builder.createSrcRegRegion(copyDst, builder.getRegionStride1()), 1);
5169                     }
5170                 }
5171                 else
5172                 {
5173                     // copy dst
5174                     auto dst = inst->getDst();
5175                     auto dstDcl = dst->getBase()->asRegVar()->getDeclare();
5176                     auto copyIter = std::next(i);
5177                     G4_Declare* copySrc = builder.createTempVar(dstSize * numEltPerGRF<Type_UD>(), Type_UD, Any);
5178                     // speical case when send dst declare is <1 GRF (it must still be GRF-aligned)
5179                     if (dstDcl->getByteSize() < getGRFSize())
5180                     {
5181                         auto numDWords = dstDcl->getByteSize() / TypeSize(Type_UD);
5182                         assert(numDWords > 0);
5183                         copyDwords(dstDcl, 0, copySrc, 0, numDWords, bb, copyIter);
5184                     }
5185                     else
5186                     {
5187                         copyRegs(dstDcl, dst->getRegOff() * getGRFSize(),
5188                             copySrc, 0, dstSize, bb, copyIter);
5189                     }
5190                     inst->setDest(builder.createDstRegRegion(copySrc, 1));
5191                 }
5192             }
5193         }
5194 
5195     }
5196 
5197 }
5198 
fixsrc1src2Overlap(G4_BB * bb)5199 void HWConformity::fixsrc1src2Overlap(G4_BB* bb)
5200 {
5201     for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
5202     {
5203         G4_INST* inst = *i;
5204 
5205         if (inst->opcode() != G4_mad)
5206         {
5207             continue;
5208         }
5209 
5210         G4_Operand* src1 = inst->getSrc(1);
5211         G4_Operand* src2 = inst->getSrc(2);
5212 
5213         if (src1 && src2 &&
5214             !src1->isNullReg() && !src2->isNullReg() &&
5215             src1->getType() == src2->getType())
5216         {
5217             G4_CmpRelation cmpResult = src1->compareOperand(src2);
5218             if (cmpResult != Rel_disjoint && cmpResult != Rel_undef)
5219             {
5220                 G4_Type movType = src2->getType();
5221                 bool changeType = true;
5222                 switch (src2->getType())
5223                 {
5224                 case Type_DF:
5225                     movType = Type_UQ;
5226                     break;
5227                 case Type_F:
5228                     movType = Type_UD;
5229                     break;
5230                 case Type_HF:
5231                     movType = Type_UW;
5232                     break;
5233                 default:
5234                     changeType = false;
5235                     break;
5236                 }
5237                 if (changeType)
5238                 {
5239                     G4_Operand* opnd = insertMovBefore(i, 2, movType, bb);
5240                     INST_LIST_ITER prev_it = i;
5241                     prev_it--;
5242                     G4_INST* movInst = (*prev_it);
5243                     movInst->getSrc(0)->asSrcRegRegion()->setType(movType);
5244                     opnd->asSrcRegRegion()->setType(src2->getType());
5245                     inst->setSrc(opnd, 2);
5246                 }
5247             }
5248         }
5249     }
5250 }
5251 
fixOverlapInst(G4_BB * bb)5252 void HWConformity::fixOverlapInst(G4_BB* bb)
5253 {
5254     for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
5255     {
5256         G4_INST* inst = *i;
5257 
5258         if (inst->mayExceedTwoGRF() || inst->opcode() == G4_madm)
5259         {
5260             continue;
5261         }
5262 
5263         if (inst->getDst() != NULL)
5264         {
5265             // create copy if dst and src0/src1 overlap due to being the same variable
5266             G4_Operand* dst = inst->getDst();
5267             if (dst != NULL && dst->isDstRegRegion() && dst->getTopDcl() && dst->getTopDcl()->getRegFile() == G4_GRF)
5268             {
5269                 int dstSize = (dst->getLinearizedEnd() - dst->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
5270                 int srcSize = 1;
5271 
5272                 bool srcOverlap = false;
5273                 for (int i = 0; i < inst->getNumSrc(); i++)
5274                 {
5275                     G4_Operand* src = inst->getSrc(i);
5276                     if (src != NULL && !src->isNullReg() && src->getTopDcl() && src->getTopDcl()->getRegFile() == G4_GRF)
5277                     {
5278                         srcOverlap |= inst->getDst()->compareOperand(inst->getSrc(i)) == Rel_interfere;
5279                         if (srcOverlap)
5280                         {
5281                             srcSize = (src->getLinearizedEnd() - src->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
5282                             break;
5283                         }
5284                     }
5285                 }
5286 
5287                 if (srcOverlap && (dstSize > 1 || srcSize > 1))
5288                 {
5289                     G4_AccRegSel accSel = inst->getDst()->getAccRegSel();
5290                     G4_DstRegRegion* newDst = insertMovAfter(i, inst->getDst(), inst->getDst()->getType(), bb);
5291                     newDst->setAccRegSel(accSel);
5292                     inst->setDest(newDst);
5293                 }
5294             }
5295         }
5296     }
5297 }
5298 
5299 //
5300 // Fix sel and csel instructions:
5301 //  -- set their cond mod to null as they don't modify it.  They will be hard-coded to f0.0 in Gen asm
5302 
fixSelCsel(INST_LIST_ITER it,G4_BB * bb)5303 void HWConformity::fixSelCsel(INST_LIST_ITER it, G4_BB* bb)
5304 {
5305     G4_INST* inst = *it;
5306     if (inst->opcode() == G4_sel || inst->opcode() == G4_csel)
5307     {
5308         G4_CondMod* condMod = inst->getCondMod();
5309         if (condMod)
5310         {
5311             condMod->setBase(nullptr);
5312         }
5313     }
5314 }
5315 
avoidDstSrcOverlap(PointsToAnalysis & p)5316 void HWConformity::avoidDstSrcOverlap(PointsToAnalysis& p)
5317 {
5318     for (auto& bb : kernel.fg)
5319     {
5320         INST_LIST_ITER i = bb->begin(), iEnd = bb->end();
5321         INST_LIST_ITER next_iter = i;
5322         curBB = bb;
5323         for (; i != iEnd; i = next_iter)
5324         {
5325             ++next_iter;
5326             avoidInstDstSrcOverlap(i, bb, p);
5327         }
5328     }
5329 }
5330 
5331 //
5332 //  Avoid the dst and src overlap when they are using the same variable by inserting a mov instruction
5333 //  add(8)  var1<2>, var2, var1<0, 1, 0>
5334 //
avoidInstDstSrcOverlap(INST_LIST_ITER it,G4_BB * bb,PointsToAnalysis & p)5335 void HWConformity::avoidInstDstSrcOverlap(INST_LIST_ITER it, G4_BB* bb, PointsToAnalysis& p)
5336 {
5337     G4_INST* inst = *it;
5338 
5339     if (inst->mayExceedTwoGRF() ||
5340         inst->opcode() == G4_nop ||
5341         inst->opcode() == G4_madm ||
5342         inst->isLabel())
5343     {
5344         return;
5345     }
5346 
5347     auto dst = inst->getDst();
5348     if (!dst ||
5349         dst->isNullReg() ||
5350         !dst->getBase()->isRegVar())
5351     {
5352         return;
5353     }
5354 
5355     auto dstSize = inst->getExecSize() * dst->getTypeSize() * dst->getHorzStride();
5356     //Handle VxH
5357     if (dstSize > getGRFSize())
5358     {
5359         // special check for 2-GRF instruction with VxH operands
5360         // strictly speaking dst and VxH src may overlap only if src's address may point to dst variable,
5361         // but we skip such check as VxH access is rare and already expensive, so adding an extra move won't cause much extra overhead
5362         bool hasVxH = std::any_of(inst->src_begin(), inst->src_end(),
5363             [](G4_Operand* src) { return src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegion()->isRegionWH(); });
5364         if (hasVxH)
5365         {
5366             replaceDst(it, dst->getType());
5367             return;
5368         }
5369     }
5370 
5371     G4_Declare* dstDcl = dst->getTopDcl();
5372     if (dstDcl)
5373     {
5374         G4_DstRegRegion* dstRgn = dst;
5375         int dstOpndNumRows = ((dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart()) / numEltPerGRF(Type_UB)) + 1;
5376         int dstLeft = dstRgn->getLinearizedStart();
5377         int dstRight = dstOpndNumRows > 1 ? ((dstLeft / numEltPerGRF(Type_UB) + 1) * numEltPerGRF(Type_UB) - 1) :
5378             dstRgn->getLinearizedEnd();
5379 
5380         for (int i = 0, nSrcs = inst->getNumSrc(); i < nSrcs; i++)
5381         {
5382             G4_Operand* src = inst->getSrc(i);
5383 
5384             if (!src || src->isNullReg() || !src->getTopDcl())
5385             {
5386                 continue;
5387             }
5388             G4_Declare* srcDcl = src->getTopDcl();
5389             G4_CmpRelation rel = dst->compareOperand(src);
5390             if (src->isSrcRegRegion())
5391             {
5392                 G4_SrcRegRegion* srcRg = src->asSrcRegRegion();
5393                 if (srcDcl == dstDcl &&
5394                     srcRg->getRegAccess() == Direct &&
5395                     srcRg->getBase()->isRegVar())
5396                 {
5397                     if (rel != Rel_disjoint && rel != Rel_undef) //Overlap
5398                     {
5399                         G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
5400                         int srcOpndNumRows = ((srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart()) / numEltPerGRF(Type_UB)) + 1;
5401                         int srcLeft = srcRgn->getLinearizedStart();
5402                         int srcRight = srcRgn->getLinearizedEnd();
5403 
5404                         if (!srcRgn->isScalar() && srcOpndNumRows > 1)
5405                         {
5406                             srcLeft = (srcRgn->getLinearizedStart() / numEltPerGRF(Type_UB) + 1) * numEltPerGRF(Type_UB);
5407                         }
5408 
5409                         if (dstOpndNumRows > 1 || srcOpndNumRows > 1)
5410                         {
5411                             if (!(srcLeft > dstRight || dstLeft > srcRight))
5412                             {
5413                                 inst->setSrc(insertMovBefore(it, i, src->getType(), bb), i);
5414                             }
5415                         }
5416                     }
5417                 }
5418                 else if (srcRg->isIndirect())
5419                 {
5420                     G4_RegVar* ptvar = NULL;
5421                     int vid = 0;
5422                     while ((ptvar = p.getPointsTo(srcDcl->getRegVar(), vid++)) != NULL)
5423                     {
5424                         G4_Declare* dcl = ptvar->getDeclare();
5425                         if (dstDcl == dcl)
5426                         {
5427                             G4_AccRegSel accSel = inst->getDst()->getAccRegSel();
5428                             G4_DstRegRegion* newDst = insertMovAfter(it, inst->getDst(), inst->getDst()->getType(), bb);
5429                             newDst->setAccRegSel(accSel);
5430                             inst->setDest(newDst);
5431                             return;
5432                         }
5433                     }
5434                 }
5435             }
5436         }
5437     }
5438 }
5439 
fixCalla(INST_LIST_ITER it,G4_BB * bb)5440 void HWConformity::fixCalla(INST_LIST_ITER it, G4_BB *bb)
5441 {
5442     G4_INST* fcall = *it;
5443     G4_Operand* src0 = fcall->getSrc(0);
5444 
5445     // fcall could have imm/label src for direct call
5446     // No need to fix src reg at the case
5447     if (!src0->isSrcRegRegion())
5448         return;
5449 
5450     if (builder.isOpndAligned(src0, getGRFSize()))
5451         return;
5452 
5453     // insert a mov before fcall(calla) to mov src to a grf aligned reg
5454     replaceSrc(it, 0, src0->getType(), bb, GRFALIGN);
5455 }
5456 
replaceHFBFwithFloat(INST_LIST_ITER it,G4_BB * bb)5457 void HWConformity::replaceHFBFwithFloat(INST_LIST_ITER it, G4_BB* bb)
5458 {
5459     auto* inst = *it;
5460     auto* dst = inst->getDst();
5461     auto* src0 = inst->getSrc(0);
5462     assert(src0->getType() == Type_BF || src0->getType() == Type_HF);
5463 
5464     G4_InstDpas* dpasInst = inst->asDpasInst();
5465     uint8_t C = dpasInst->getRepeatCount();
5466 
5467     unsigned int src_l = src0->getLinearizedStart();
5468     unsigned int src_r = src0->getLinearizedEnd();
5469     unsigned int dstGRFSize = (src_r - src_l + 1) * (TypeSize(Type_F) / src0->getTypeSize());
5470     unsigned movInstNum = (((dstGRFSize + getGRFSize() - 1) / getGRFSize()) + 1) / 2; //2 GRFs per instruction
5471 
5472     G4_Declare* dcl = builder.createTempVar(builder.getNativeExecSize() * C, Type_F, ThirtyTwo_Word);
5473 
5474     // Copy HF/BF data to float with mov instructions.
5475     // If the new destination is more than 2 GRFs, multiple moves required.
5476     for (unsigned i = 0; i < movInstNum; i++)
5477     {
5478         G4_DstRegRegion* newDst = builder.createDst(
5479             dcl->getRegVar(),
5480             2 * i,
5481             0,
5482             dst->getHorzStride(),
5483             Type_F);
5484 
5485         G4_Operand* newSrc = builder.createSrc(
5486             src0->getBase(),
5487             src0->asSrcRegRegion()->getRegOff() + i,
5488             src0->asSrcRegRegion()->getSubRegOff(),
5489             builder.getRegionStride1(),
5490             src0->asSrcRegRegion()->getType());
5491 
5492         G4_ExecSize numOfF {(2 * getGRFSize()) / TypeSize(Type_F)};
5493         if (i == movInstNum - 1)
5494         {
5495             numOfF = G4_ExecSize((dstGRFSize / TypeSize(Type_F)) - i * numOfF);
5496         }
5497         G4_INST* newInst = builder.createMov(numOfF, newDst, newSrc, InstOpt_WriteEnable, false);
5498 
5499         bb->insertBefore(it, newInst);
5500     }
5501 
5502     //Replace the original source with the float type operand
5503     G4_Operand* newSrc0 = builder.createSrc(
5504         dcl->getRegVar(),
5505         0,
5506         0,
5507         builder.getRegionStride1(),
5508         dcl->getElemType());
5509     inst->setSrc(newSrc0, 0);
5510 
5511     return;
5512 }
5513 
fixDPAS(INST_LIST_ITER it,G4_BB * bb)5514 void HWConformity::fixDPAS(INST_LIST_ITER it, G4_BB *bb)
5515 {
5516     G4_INST* inst = *it;
5517 
5518     if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010725011) &&
5519         !builder.getOption(vISA_EnableDPASBFHFH))
5520     {
5521         G4_Type src0Type = inst->getSrc(0)->getType();
5522 
5523         if (src0Type == Type_BF || src0Type == Type_HF)
5524         {
5525             replaceHFBFwithFloat(it, bb);
5526         }
5527     }
5528 }
5529 
// Legalizes the instructions of one basic block.
//
// The function makes up to five passes over `bb`:
//   1. The main pass runs the per-instruction fix* routines in a fixed order.
//      Several of them re-point `next_iter` so the loop re-visits what they inserted.
//   2. If the platform does not support float/64b regioning, fixUnalignedRegions()
//      is run on every instruction.
//   3. If the platform has no int64 support, emulate64bMov() is run on every mov
//      whose dst or src0 is Q-typed.
//   4. If the native exec size is greater than SIMD8, fixByteXBarRestriction() is
//      run on every instruction.  (Otherwise the function returns here, which also
//      skips pass 5.)
//   5. On GENX_PVCXT, instructions with a cross-GRF Q-type dst and no Q-type source
//      (excluding send and dpas) are split with evenlySplitInst().
void HWConformity::conformBB(G4_BB* bb)
{
    INST_LIST_ITER i = bb->begin(), iEnd = bb->end();
    INST_LIST_ITER next_iter = i;
    for (; i != iEnd; i = next_iter)
    {
        // by default we skip the newly inserted instructions as we assume they are already HW conformed
        // if a check may produce new instructions that violate HW rules, it must adjust the next_iter
        // to point to them
        ++next_iter;
        G4_INST* inst = *i;
        G4_opcode opcode = inst->opcode();

        // dpas gets only its dedicated fix; none of the checks below are applied to it
        if (inst->isDpas())
        {
            fixDPAS(i, bb);
            continue;
        }

        if (inst->isFCall() && builder.supportCallaRegSrc())
            fixCalla(i, bb);

        // nothing below applies to nops, labels, or non-send instructions that may exceed two GRFs
        if ((inst->mayExceedTwoGRF() && !inst->isSend()) ||
            opcode == G4_nop ||
            opcode == G4_label)
        {
            continue;
        }

        if (builder.getOption(vISA_InsertDummyMovForHWRSWA) &&
            (VISA_WA_CHECK(builder.getPWaTable(), Wa_16012061344) ||
             VISA_WA_CHECK(builder.getPWaTable(), Wa_16012292205)))
        {
            fixPredicateIndirectInst(i, bb);
        }
        // do this early since otherwise the moves inserted by other passes may still
        // inherit bad regions from the original inst
        fixSrcRegion(inst);

        bool changed = fixMov(i, bb);
        if (changed)
        {
            // resume right after the current position so instructions placed there get visited
            next_iter = i;
            next_iter++;
        }

        fixOpndType(i, bb);

        fixSelCsel(i, bb);

        fixPredCtrl(i, bb);

        if (inst->getExecSize() > builder.getNativeExecSize())
        {
            if (inst->opcode() == G4_math &&
                inst->getDst()->getType() == Type_HF &&
                inst->getSrc(0)->getType() == Type_HF &&
                (!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
            {
                // split pure HF math to simd8
                evenlySplitInst(i, bb);
            }
        }
        fix3SrcInst(i, bb);

        // NOTE: `dst` is re-read from the instruction before each later use below
        G4_Operand* dst = inst->getDst();

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->isMath())
        {
            if (fixMathInst(i, bb))
            {
                // check the newly added insts later
                next_iter = i;
                next_iter++;
            }
        }

        // re-read the instruction at the current iterator position
        inst = *i;

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_mul)
        {
            if (fixMULInst(i, bb))
            {
                // inserted mach and mov
                // check the newly added insts later (MUL, MACH, MOV)
                next_iter = i;
                next_iter++;
            }
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_mulh)
        {
            fixMULHInst(i, bb);
            // next_iter = i makes the loop process the instruction at `i` again.
            // NOTE(review): this only terminates if fixMULHInst leaves a non-mulh
            // instruction at `i` -- presumably it rewrites mulh into mul; confirm.
            next_iter = i;
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_madw)
        {
            // fixMadwInst returns the position at which the loop should resume
            next_iter = fixMadwInst(i, bb);
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        // HW check #6: indirect operand spilling
        fixIndirectOpnd(i, bb);

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        // HW check #8: unsigned dst with execution type F
        /* If the execution type is F and the destination type is either UD, UW
         * or UB and the destination is not saturated, then we need to add an
         * intermediate type conversion to D.
         */
        // NOTE(review): the code that follows only handles cmp/cmpn via
        // fixCompareInst; the check described above is not visible here.
        inst = *i;
        opcode = inst->opcode();

        if (opcode == G4_cmp || opcode == G4_cmpn)
        {
            dst = inst->getDst();
            // dst element size passed to fixCompareInst: 0 for a missing/null dst,
            // the size of UW for a predicate dst, otherwise the dst type size
            int dst_elsize = 0;
            bool null_dst = !dst || inst->hasNULLDst();
            if (!null_dst)
            {
                dst_elsize = dst->isPredicate() ? TypeSize(Type_UW) : dst->getTypeSize();
            }
            int extypesize;
            G4_Type extype = inst->getOpExecType(extypesize);
            fixCompareInst(i, bb, extype, dst_elsize);
        }
        dst = inst->getDst();

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        if (fixAcc(i, bb))
        {
            next_iter = i;
            next_iter++;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        {
            dst = inst->getDst();
            G4_Type extype = inst->getExecType2();
            int extypesize = TypeSize(extype);
            int dst_elsize = 0;
            if (dst)
            {
                dst_elsize = dst->getTypeSize();
            }

            // SIMD1 instruction whose dst element is narrower than the execution
            // type: fix the dst horizontal stride, except for the excluded cases
            if (dst &&
                inst->getExecSize() == g4::SIMD1 &&
                dst_elsize < extypesize &&
                !IS_VTYPE(extype) &&
                !inst->isMixedMode() &&
                !hasDedicateAlignRegionConformity(inst) &&
                !inst->isSend())
            {
                fixDstHstride(i, extypesize);
            }
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        // when fixPlaneInst reports true the remaining checks are skipped for this position
        bool planeDeleted = fixPlaneInst(i, bb);
        if (planeDeleted)
        {
            continue;
        }

        fixLine(i, bb);
        fixRotate(i, bb);

        if (!builder.hasVxHFloat64b())
        {
            fixVxHFloat64b(i, bb);
        }

        if (fix64bInst(i, bb))
        {
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        fixImm64(i, bb); // fixed immediates for DF4 in fixImm64()

        // from here on the opcode is re-read through the iterator for every check
        if ((*i)->opcode() == G4_mov)
        {
            if (fixBFMove(i, bb))
            {
                continue;
            }
        }
        if ((*i)->opcode() == G4_fcvt)
        {
            (void)fixFcvt(i, bb);
            continue;
        }
        if ((*i)->opcode() == G4_srnd)
        {
            (void)fixSrnd(i, bb);
            continue;
        }

        if ((*i)->opcode() == G4_shl || (*i)->opcode() == G4_shr || (*i)->opcode() == G4_asr)
        {
            fixShiftInsts(i, bb);
            continue;
        }

        if (builder.getPlatform() == GENX_BDW)
        {
            fixPackedHFConversions(i, bb);
        }

        fixFloatARFDst(i, bb);
    }

    if (!builder.supportFloatOr64bRegioning())
    {
        for (auto iter = bb->begin(), iterEnd = bb->end(); iter != iterEnd; /* empty */)
        {
            // pre-compute nextIter as the call may destroy iter
            auto nextIter = std::next(iter);
            // since insertMovBefore/After and similar helper instructions do not
            // understand XeHP_SDV regioning restrictions, they may produce illegal moves
            // We do a catch-all pass here to catch them
            fixUnalignedRegions(iter, bb);
            iter = nextIter;
        }
    }

    // previous legalization passes may introduce int64 moves on platforms that don't support int64
    // we do another catch-all pass here to legalize any such moves
    // ToDo: see if we can remove other calls to emulate64Mov()
    if (builder.noInt64())
    {
        for (auto I = bb->begin(), E = bb->end(); I != E;)
        {
            auto inst = *I;
            // successor is taken before emulate64bMov runs on I
            auto next = std::next(I);
            if (inst->opcode() == G4_mov && (IS_QTYPE(inst->getDst()->getType()) || IS_QTYPE(inst->getSrc(0)->getType())))
            {
                emulate64bMov(I, bb);
            }
            I = next;
        }
    }

    // The remaining passes only run when the native exec size is greater than SIMD8.
    // NOTE: this early return also skips the GENX_PVCXT pass at the end of the function.
    if (builder.getNativeExecSize() <= g4::SIMD8)
    {
        return;
    }
    i = bb->begin(), iEnd = bb->end();
    next_iter = i;
    for (; i != iEnd; i = next_iter)
    {
        // by default we skip the newly inserted instructions as we assume they are already HW conformed
        // if a check may produce new instructions that violate HW rules, it must adjust the next_iter
        // to point to them
        ++next_iter;
        fixByteXBarRestriction(i, bb);
#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
    }

    // GENX_PVCXT only: split instructions that have a cross-GRF Q-type dst and no Q-type source
    if (builder.getPlatform() == GENX_PVCXT) {
      for (auto I = bb->begin(), E = bb->end(); I != E;) {
        auto inst = *I;
        // successor is taken before the instruction at I is split
        auto next = std::next(I);

        G4_DstRegRegion *dst = inst->getDst();
        bool crossGRFDst = dst && dst->isCrossGRFDst();

        if (crossGRFDst && IS_QTYPE(dst->getType()) && !inst->isSend() &&
            !inst->isDpas()) {
          bool hasQTypeSrc = false;
          // NOTE: this `int i` shadows the function-level iterator `i`
          for (int i = 0; i < inst->getNumSrc(); i++) {
            if (IS_QTYPE(inst->getSrc(i)->getType())) {
              hasQTypeSrc = true;
              break;
            }
          }

          if (!hasQTypeSrc) {
            evenlySplitInst(I, bb);

#ifdef _DEBUG
            verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
          }
        }

        I = next;
      }
    }
}
5857 
5858 //
5859 // SIMD16 addc/subb are illegal on GEN, since they write to acc and there are
5860 // only 8 acc channels for D/UD type.  In vISA IR we should get something like
5861 //   addc (16|M0) V0  V2       V3
5862 //   use  (16|M0) V1  ... acc0:ud // or :d
5863 // which needs to be translated to
5864 //   addc (8|M0)  V0(0)  V2(0)  V3(0)
5865 //   use  (8|M0)  V1(0) ... acc0:ud
5866 //   addc (8|M8)  V0(1)  V2(1)  V3(1)
5867 //   use  (8|M8)  V1(1) ... acc0:ud
5868 // NOTE: we also support other consumers such as add.
5869 //
5870 //
5871 // We do this first thing in HW conformity to avoid REXES from splitting addc/subb incorrectly
5872 // We also count on previous opt to preserve the inst pair by not inserting any acc using inst in between;
5873 // it should hopefully be the case since we generally don't optimize instructions with acc src/dst
5874 //
5875 // If exec size of addc is < 8, we also have to make sure both the addc's dst and the carry move's dst are
5876 // GRF-aligned, since acc's channel is dependent on the dst's subreg offset.  In other words, we fix
5877 //   addc (1) r1.0 ...
5878 //   mov (1) r1.1 acc0.0<0;1,0>
5879 // into
5880 //   addc (1) r1.0 ...
5881 //   mov (1) r2.0 acc0.0<0;1,0>
5882 //   mov (1) r1.1 r2.0
5883 //
fixAddcSubb(G4_BB * bb)5884 bool HWConformity::fixAddcSubb(G4_BB* bb)
5885 {
5886     bool changed = false;
5887     for (auto iter = bb->begin(), iterEnd = bb->end();
5888         iter != iterEnd; ++iter)
5889     {
5890         G4_INST* inst = *iter;
5891 
5892         if (inst->opcode() != G4_addc && inst->opcode() != G4_subb)
5893         {
5894             continue;
5895         }
5896 
5897         // Fix the src1 if it's a immediate operand whose type can only be :ud
5898         for (int i = 0; i < 2; i++)
5899         {
5900             G4_Operand* src = inst->getSrc(i);
5901             if (src && src->isImm() && src->getType() == Type_UW)
5902             {
5903                 // just change the immediate's type to :ud
5904                 uint32_t immVal = (uint32_t)src->asImm()->getImm();
5905                 inst->setSrc(builder.createImm(immVal, Type_UD), i);
5906             }
5907         }
5908 
5909         if (inst->getExecSize() != builder.getNativeExecSize())
5910         {
5911             // find the matching carry move
5912             G4_INST* carryUse = nullptr;
5913             auto srchIter = iter;
5914             for (++srchIter; srchIter != iterEnd; ++srchIter)
5915             {
5916                 G4_INST* inst2 = *srchIter;
5917                 auto op = inst2->opcode();
5918 
5919                 bool opPossibleConsumer =
5920                     op == G4_mov || op == G4_add || op == G4_addc ||
5921                     op == G4_mad || op == G4_pseudo_mad || op == G4_add3;
5922 
5923                 // only check for a handful of user instructions
5924                 // this list could be extended
5925                 if (opPossibleConsumer &&
5926                     inst2->getExecSize() == inst->getExecSize() &&
5927                     inst2->useAcc())
5928                 {
5929                     carryUse = inst2;
5930                     break;
5931                 }
5932                 else if (inst2->useAcc())
5933                 {
5934                     // someone redefines acc0; we can stop looking
5935                     break;
5936                 }
5937             }
5938 
5939             if (carryUse == NULL)
5940             {
5941                 // can't find the move using acc, skip this addc/subb
5942                 assert(false && "unable to find addc/subc consumer");
5943                 continue;
5944             }
5945 
5946             if (inst->getExecSize() > builder.getNativeExecSize())
5947             {
5948                 // we're breaking a bigger instruction into a smaller one
5949                 evenlySplitInst(iter, bb);
5950                 evenlySplitInst(srchIter, bb);
5951 
5952                 // srchIter now points to the second half of move, and we want to move the first move to be
5953                 // before the second half of the addc/subb, which is pointed by iter
5954                 --srchIter;
5955                 G4_INST* mov1 = *srchIter;
5956                 bb->erase(srchIter);
5957                 bb->insertBefore(iter, mov1);
5958 
5959                 changed = true;
5960             }
5961             else
5962             {
5963                 // we will need to GRF-align addc's dst as well as the move dst,
5964                 // so that the acc will have the correct offset
5965                 // note that insertMovAfter will align the tmp since addc/subb has implicit acc use
5966                 if (!builder.isOpndAligned(inst->getDst(), 32))
5967                 {
5968                     inst->setDest(
5969                         insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb));
5970                     changed = true;
5971                 }
5972                 if (!builder.isOpndAligned(carryUse->getDst(), 32))
5973                 {
5974                     carryUse->setDest(
5975                         insertMovAfter(srchIter, carryUse->getDst(), carryUse->getDst()->getType(), bb));
5976                     changed = true;
5977                 }
5978             }
5979         }
5980     }
5981     return changed;
5982 }
5983 
5984 //
5985 // Mixed mode instruction allows bfloat16 operands in the following cases:
5986 //   1. dst, src0, and src1 for 2 source instructions format not involving multiplier(mov, add, cmp, sel).
5987 //   2. dst and src0 for 2 source instructions format involving multiplier(mul, mac etc).
5988 //   3. dst, src0, and src1 for 3 source instructions format(mad).
5989 //   4. Broadcast of bfloat16 scalar is not supported.
5990 //   5. Unpacked bfloat16 destination with stride 2 when register offset is 0 or 1.
5991 //   6. Packed bfloat16 source and destination when register offset is 0 or 8.
5992 //   7. Execution size must not be greater than 8.
5993 //   8. Instructions with pure bfloat16 operands are not supported.
5994 //   6 & 7: register offset would be 0 or 16; execution size is at most 16
5995 //
5996 // **More examples**
5997 //   1. BF imm is not allowed
5998 //      mov  (1|M0)  r12.0<1>:f  0xffff:bf - ILLEGAL "Imm operand with BF type is not allowed"
5999 //   2. BF scalar operand can be used in SIMD1
6000 //      mul  (1|M0)  r14.0<1>:f  r11.0<0;1,0>:bf  r12.3<0;1,0>:f - OK
6001 //   3. For SIMD1, scalar operands (both dst/src) of F or BF can have any subreg!
6002 //      add  (1|M0)  r16.3<1>:bf  r11.0<0;1,0>:f  r12.3<0;1,0>:f - OK
6003 //   4. F Operand should have subreg = 0 if execSize > SIMD1
6004 //      add  (2|M0)  r10.4<1>:f  r11.0<1;1,0>:bf   0x12345:f
6005 //       ILLEGAL "Src0 regioning must be aligned to destination or scalar for Float/64bit pipes"
6006 //   5. Others
6007 //     add  (8|M0)  r16.0<2>:bf  r11.0<1;1,0>:f  r12.0<1;1,0>:f- OK
6008 //     add  (8|M0)  r16.1<2>:bf  r11.0<1;1,0>:f  r12.8<1;1,0>:f- OK
6009 //     add  (8|M0)  r16.0<1>:bf  r11.0<1;1,0>:f  r12.8<1;1,0>:f- OK
6010 //     add  (8|M0)  r16.8<1>:bf  r11.0<1;1,0>:f  r12.0<1;1,0>:f- OK
6011 //         Note that float source operands  can be scalar region <0;1,0>
6012 //
fixBFMixedMode()6013 void HWConformity::fixBFMixedMode()
6014 {
6015     auto useGivenType = [](G4_INST* I, G4_Type GivenTy)
6016     {
6017         G4_Operand* dst = I->getDst();
6018         // ignore cmp's dst (?)
6019         if (dst && !dst->isNullReg() && !I->isCompare())
6020         {
6021             if (dst->getType() == GivenTy)
6022                 return true;
6023         }
6024         for (int i = 0; i < I->getNumSrc(); ++i)
6025         {
6026             G4_Operand* src = I->getSrc(i);
6027             if (src && !src->isNullReg())
6028             {
6029                 if (src->getType() == GivenTy)
6030                     return true;
6031             }
6032         }
6033         return false;
6034     };
6035 
6036     auto allowBFForInst = [](G4_INST* I, Gen4_Operand_Number OpndNum = Opnd_total_num)
6037     {
6038         // Only mul/mac/mad/add/cmp/mov/sel support BF mixed mode.
6039         switch (I->opcode())
6040         {
6041         case G4_mul:
6042         case G4_mac:
6043             {
6044                 if (OpndNum == Opnd_src1)
6045                     return false;
6046                 return true;
6047             }
6048         case G4_mad:
6049         case G4_pseudo_mad:
6050             {
6051                 if (OpndNum == Opnd_src2)
6052                     return false;
6053                 return true;
6054             }
6055         case G4_add:
6056         case G4_cmp:
6057         case G4_mov:
6058         case G4_sel:
6059             return true;
6060         default:
6061             break;
6062         }
6063         return false;
6064     };
6065 
6066     auto skipBFCheck = [&useGivenType](G4_INST* I)
6067     {
6068         // Skip dpas/send
6069         if (I->isDpas() || I->isSend())
6070             return true;
6071 
6072         // Do not use BF, skip
6073         if (!useGivenType(I, Type_BF))
6074             return true;
6075 
6076         // Special case:
6077         //   1.  mov d:bf  s:bf   --> mov d:uw  s:uw
6078         //   2.  mov d:f   s:bf   --> shl d:ud  s:uw  16:ud
6079         if (I->opcode() == G4_mov && I->getSrc(0)->getType() == Type_BF)
6080         {
6081             // this will be handled by fixBFMov.
6082             return true;
6083         }
6084         return false;
6085     };
6086 
6087     if (!kernel.fg.builder->hasBFMixMode())
6088     {
6089         return;
6090     }
6091 
6092     const G4_ExecSize nativeES = kernel.fg.builder->getNativeExecSize();
6093     for (auto& bb : kernel.fg)
6094     {
6095         // First iteration:
6096         //    1. Legalize scalar BF operand for insts that need splitting
6097         //       (If this is done in 3, we will have more than 1 scalar mov.)
6098         //         mul (16|M0)  d<1>:f  s0<1;1,0>:bf  s1<0;1,0>:bf
6099         //       ==>
6100         //         (W) mov (1|M0) t<1>:f   s1<0;1,0>:bf
6101         //         mul (16|M0)  d<1>:f  s0<1;1,0>:bf  t<0;1,0>:f
6102         //    2. split instructions  (case 7)
6103         //         add (16|M0)   d:bf   s0:bf    s1:bf
6104         //       ==>
6105         //         add (8|M0)   d:bf    s0:bf    s1:bf
6106         //         add (8|M8)   d.8:bf  s0.8:bf  s1.8:bf
6107         //    3. legalize operands by using cvt mov to BF or from BF. (case 1&2&3)
6108         //         mul (8|M0) d:bf   s0:bf  s1:bf
6109         //       ==>
6110         //         mov (8|M0) s:f   s1:bf
6111         //         mul (8|M0) t:bf   s0:bf  s:f
6112         //    Note pure BF insts will be handled in the second iteration.
6113         INST_LIST_ITER nextII = bb->begin();
6114         for (auto II = nextII, IE = bb->end(); II != IE; II = nextII)
6115         {
6116             ++nextII;
6117             G4_INST* Inst = *II;
6118             if (skipBFCheck(Inst))
6119                 continue;
6120 
6121             const bool isBFAllowedInst = allowBFForInst(Inst);
6122             const G4_ExecSize currES = Inst->getExecSize();
6123             std::list<INST_LIST_ITER> instsToSplit;
6124 
6125             // 1. Handle illegal BF scalar by generating mov
6126             //    First generate mov for scalars instead of splitting first and
6127             //    than generating mov. Doing so would need just one mov.
6128             bool changed = false;
6129             if (currES > nativeES)
6130             {
6131                 // If inst's execsize <= nativeES, it doesn't need splitting,
6132                 // as its operand takes one GRF at most.
6133                 for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
6134                 {
6135                     G4_Operand* S = Inst->getSrc(i);
6136                     Gen4_Operand_Number opndNum = Inst->getSrcOperandNum(i);
6137                     if (S->getType() == Type_BF && S->isSrcRegRegion())
6138                     {
6139                         if (S->asSrcRegRegion()->getRegion()->isScalar()
6140                             && (!isBFAllowedInst || !allowBFForInst(Inst, opndNum)))
6141                         {
6142                             G4_Operand* newSrc = insertMovBefore(II, i, Type_F, bb);
6143                             Inst->setSrc(newSrc, i);
6144                             changed = true;
6145                         }
6146                     }
6147                     else if (S->getType() == Type_BF && S->isImm())
6148                     {
6149                         assert(false && "BF immediate not supported!");
6150                     }
6151                 }
6152             }
6153 
6154             // If changed, check if it still uses BF. Skip if not.
6155             if (changed && !useGivenType(Inst, Type_BF))
6156             {
6157                 continue;
6158             }
6159 
6160             // 2. Split instruction (case 7) if needed
6161             //    Now, BF operands are all non-scalar for insts that need splitting.
6162             //    We split inst under the following:
6163             //      1. If an inst, which don't support BF, has BF operands. Those BF operands
6164             //         must be replaced with F operands (by inserting mov to convert BF to F).
6165             //         If replacing a BF operand with a F operand makes it cross 2 GRF, it must
6166             //         be splitted (currES * F" > 2 GRF); or
6167             //      2. Split if currES > nativeES for insts that support BF. (case 7)
6168             std::list<INST_LIST_ITER> instsToCheck;
6169             if ((!isBFAllowedInst && (TypeSize(Type_F) * currES) > (getGRFSize() * 2))
6170                 || (isBFAllowedInst && currES > nativeES))
6171             {
6172                 if (currES == g4::SIMD32)
6173                 {
6174                     splitSIMD32Inst(II, bb);
6175                     if (isBFAllowedInst && nativeES == g4::SIMD8)
6176                     {
6177                         // need to split again.
6178                         INST_LIST_ITER prev_it = std::prev(II);
6179                         evenlySplitInst(prev_it, bb);
6180                         instsToCheck.push_back(std::prev(prev_it));
6181                         instsToCheck.push_back(prev_it);
6182                         evenlySplitInst(II, bb);
6183                     }
6184                 }
6185                 else
6186                 {
6187                     evenlySplitInst(II, bb);
6188                 }
6189                 instsToCheck.push_back(std::prev(II));
6190                 instsToCheck.push_back(II);
6191             }
6192             else
6193             {
6194                 instsToCheck.push_back(II);
6195             }
6196 
6197             // 3. Change BF operands, which are not legal, to F by generating mov.
6198             //    (isBFAllowedInst should be still valid to check if any new instruction
6199             //     from splitting is BF allowed or not.)
6200             for (auto LI : instsToCheck)
6201             {
6202                 INST_LIST_ITER thisII = LI;
6203                 G4_INST* tI = *thisII;
6204                 for (int i = 0, nsrc = (int)tI->getNumSrc(); i < nsrc; ++i)
6205                 {
6206                     G4_Operand* S = tI->getSrc(i);
6207                     Gen4_Operand_Number opndNum = tI->getSrcOperandNum(i);
6208                     if (S->getType() == Type_BF
6209                         && (!isBFAllowedInst || !allowBFForInst(tI, opndNum)))
6210                     {
6211                         G4_Operand* newSrc = insertMovBefore(thisII, i, Type_F, bb);
6212                         tI->setSrc(newSrc, i);
6213                     }
6214                 }
6215 
6216                 G4_DstRegRegion* Dst = tI->getDst();
6217                 if (!isBFAllowedInst && Dst && !Dst->isNullReg() && Dst->getType() == Type_BF)
6218                 {
6219                     G4_DstRegRegion* newDst = insertMovAfter(thisII, Dst, Type_F, bb);
6220                     tI->setDest(newDst);
6221 
6222                     auto movII = std::next(II);
6223                     instsToSplit.push_back(movII);
6224                     G4_INST* movI = *movII;
6225 
6226                     Inst->transferUse(movI);
6227                     Inst->addDefUse(movI, Opnd_src0);
6228                 }
6229             }
6230             instsToCheck.clear();
6231         }
6232 
6233         // Second iteration:
6234         //     Legalize regions by using mov.
6235         nextII = bb->begin();
6236         for (auto II = nextII, IE = bb->end(); II != IE; II = nextII)
6237         {
6238             ++nextII;
6239             G4_INST* Inst = *II;
6240             if (skipBFCheck(Inst))
6241                 continue;
6242 
6243             // Because of the first iteration above, this inst must support bf mixed mode.
6244             assert(allowBFForInst(Inst));
6245 
6246             const G4_ExecSize currES = Inst->getExecSize();
6247             bool changed = false;
6248             // case 4: broadcast of bf is not supported!
6249             //    As this bf operand is changed to F. At the end of loop, need to check
6250             //    if this inst still has both BF and F, and "changed" is for this purpose.
6251             // case 8: pure BF is not allowed.
6252             for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
6253             {
6254                 G4_Operand* S = Inst->getSrc(i);
6255                 if (S->getType() == Type_BF)
6256                 {
6257                     assert(S->isSrcRegRegion());
6258                     G4_SrcRegRegion* srcReg = S->asSrcRegRegion();
6259                     if ((srcReg->getRegion()->isScalar() && currES > g4::SIMD1)  // broadcast BF scalar
6260                         || (i == (nsrc - 1) && !useGivenType(Inst, Type_F)))     // pure BF.
6261                     {
6262                         // Insert bf->f, which is just a left-shift.
6263                         uint32_t nelts = (uint32_t)(srcReg->getRegion()->isScalar() ? g4::SIMD1 : currES);
6264                         G4_Declare* newDcl = builder.createTempVar(nelts,
6265                             Type_UD, (nelts == 1) ? Even_Word : GRFALIGN, "cvtF", false);
6266                         G4_DstRegRegion* newDst = builder.createDst(newDcl->getRegVar(), Type_UD);
6267                         srcReg->setType(Type_UW);
6268                         G4_INST* shlInst = builder.createBinOp(G4_shl,
6269                             (nelts== 1) ? g4::SIMD1 : currES,
6270                             newDst, S, builder.createImm(16, Type_UD), InstOpt_WriteEnable, false);
6271                         bb->insertBefore(II, shlInst);
6272 
6273                         // srcMod, if present, must be on the promoted F operand!
6274                         G4_SrcModifier sMod = srcReg->getModifier();
6275                         srcReg->setModifier(Mod_src_undef);
6276                         G4_SrcRegRegion* newSrc = builder.createSrc(
6277                             newDcl->getRegVar(), 0, 0,
6278                             (nelts == 1) ? builder.getRegionScalar() : builder.getRegionStride1(), Type_F);
6279                         newSrc->setModifier(sMod);
6280                         Inst->setSrc(newSrc, i);
6281 
6282                         Gen4_Operand_Number opndNum = Inst->getSrcOperandNum(i);
6283                         Inst->transferDef(shlInst, opndNum, Opnd_src0);
6284                         shlInst->addDefUse(Inst, opndNum);
6285 
6286                         changed = true;
6287                     }
6288                 }
6289             }
6290 
6291             if (changed)
6292             {
6293                 // Check again if there is still BF type, if not, we are done.
6294                 if (!useGivenType(Inst, Type_BF))
6295                 {
6296                     continue;
6297                 }
6298             }
6299 
6300             if (currES == g4::SIMD1)
6301             {
6302                 // Done
6303                 continue;
6304             }
6305 
6306             for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
6307             {
6308                 G4_Operand* S = Inst->getSrc(i);
6309                 if (S->getType() == Type_F
6310                     && (S->isImm() || (S->isSrcRegRegion() && S->asSrcRegRegion()->getRegion()->isScalar())))
6311                 {
6312                     continue;
6313                 }
6314 
6315                 assert(S->isSrcRegRegion());
6316                 G4_SrcRegRegion* sReg = S->asSrcRegRegion();
6317 
6318                 // case 6: Packed bfloat16 source and destination when register offset is 0 or 8.
6319                 //         (also for Float dst/src alignment)
6320                 //         Note that for F, enforce it to have subRegOff = 0 (too restrictive?)
6321                 bool isPackedSrc = (sReg->getRegion()->isContiguous(Inst->getExecSize())
6322                     && (sReg->getSubRegOff() == 0 || (sReg->getType() == Type_BF && sReg->getSubRegOff() == nativeES)));
6323                 if (isPackedSrc)
6324                 {
6325                     continue;
6326                 }
6327 
6328                 G4_Operand* newSrc = insertMovBefore(II, i, sReg->getType(), bb, GRFALIGN);
6329                 Inst->setSrc(newSrc, i);
6330             }
6331 
6332             if (Inst->isCompare())
6333             {
6334                 // Ignore compare's dst.
6335                 continue;
6336             }
6337 
6338             G4_DstRegRegion* dst = Inst->getDst();
6339             uint32_t subOff = dst->getSubRegOff();
6340             // case 5
6341             bool isUnpackedDst = (dst->getType() == Type_BF
6342                 && dst->getHorzStride() == 2 && (subOff == 0 || subOff == 1));
6343             // case 6, note for F, force it to have subOff = 0
6344             bool isPackedDst = (dst->getHorzStride() == 1
6345                 && (subOff == 0 || (subOff == nativeES && dst->getType() == Type_BF)));
6346             if (!(isPackedDst || isUnpackedDst))
6347             {
6348                 // case 5 Unpacked bfloat16 destination with stride 2 when register offset is 0 or 1.
6349                 G4_DstRegRegion* newDst = insertMovAfter(II, dst, dst->getType(), bb, GRFALIGN);
6350                 Inst->setDest(newDst);
6351 
6352                 auto movII = std::next(II);
6353                 G4_INST* movI = *movII;
6354 
6355                 Inst->transferUse(movI, false);
6356                 Inst->addDefUse(movI, Opnd_src0);
6357             }
6358         }
6359     }
6360 }
6361 
chkHWConformity()6362 void HWConformity::chkHWConformity()
6363 {
6364     fixDataLayout();
6365 
6366     fixBFMixedMode();
6367 
6368     for (auto bb : kernel.fg)
6369     {
6370         curBB = bb;
6371         fixIntToHFMove(bb);
6372 #ifdef _DEBUG
6373         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6374 #endif
6375         fixAddcSubb(bb);
6376 #ifdef _DEBUG
6377         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6378 #endif
6379 
6380         fixMADInst(bb);
6381 
6382 #ifdef _DEBUG
6383         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6384 #endif
6385         // fix source operand first to avoid redundant MOVs if this fix is done after
6386         // reducing execution size.
6387         // used by 3d. Mainly to fix sel with two imm sources
6388         fixOpndTypeAlign(bb);
6389 
6390 #ifdef _DEBUG
6391         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6392 #endif
6393 
6394         fixInstExecSize(bb);
6395 
6396 #ifdef _DEBUG
6397         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6398 #endif
6399 
6400         fixMixedHFInst(bb);
6401 
6402 #ifdef _DEBUG
6403         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6404 #endif
6405         fixSADA2Inst(bb);
6406 
6407 #ifdef _DEBUG
6408         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6409 #endif
6410 
6411         fixSendInst(bb);
6412 
6413         if (builder.avoidDstSrcOverlap())
6414         {
6415             fixOverlapInst(bb);
6416         }
6417 
6418         if (builder.avoidSrc1Src2Overlap())
6419         {
6420             fixsrc1src2Overlap(bb);
6421         }
6422 
6423 #ifdef _DEBUG
6424         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6425 #endif
6426 
6427         conformBB(bb);
6428 
6429 #ifdef _DEBUG
6430         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6431 #endif
6432     }
6433 
6434     if (builder.avoidDstSrcOverlap())
6435     {
6436         PointsToAnalysis p(kernel.Declares, kernel.fg.getNumBB());
6437         p.doPointsToAnalysis(kernel.fg);
6438 
6439         avoidDstSrcOverlap(p);
6440     }
6441 }
6442 
hasBadRegion(G4_INST * inst)6443 bool HWConformity::hasBadRegion(G4_INST* inst)
6444 {
6445     if (inst->getImplAccDst() || inst->getImplAccSrc())
6446         return false;
6447     bool badRegion = false;
6448 
6449 #define G4_MAX_ADDR_IMM        511
6450 #define         GENX_MAX_H_STRIDE           4
6451     for (unsigned int srcNum = 0, n_srcs = inst->getNumSrc(); srcNum < n_srcs; srcNum++)
6452     {
6453         if (!(inst->getSrc(srcNum)->isSrcRegRegion()))
6454         {
6455             continue;
6456         }
6457         const RegionDesc* rd = inst->getSrc(srcNum)->asSrcRegRegion()->getRegion();
6458         if (rd->isRegionWH())
6459         {
6460             badRegion = true;
6461             break;
6462         }
6463         if (rd->horzStride == GENX_MAX_H_STRIDE && rd->width > 1)
6464         {
6465             badRegion = true;
6466             break;
6467         }
6468         G4_SrcRegRegion* expandSrcRegion = inst->getSrc(srcNum)->asSrcRegRegion();
6469         if (expandSrcRegion->getRegAccess() != Direct)
6470         {
6471             const RegionDesc* origRegion = expandSrcRegion->getRegion();
6472             short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;
6473 
6474             if (origRegion->width == 1)
6475             {
6476                 secondSubRegOffDiff = origRegion->vertStride;
6477             }
6478             else
6479             {
6480                 secondSubRegOffDiff = origRegion->horzStride;
6481             }
6482             secondAddrImmedDiff = (short)(secondSubRegOffDiff * expandSrcRegion->getTypeSize());
6483             if ((expandSrcRegion->getAddrImm() + secondAddrImmedDiff) > G4_MAX_ADDR_IMM)
6484             {
6485                 badRegion = true;
6486                 break;
6487             }
6488         }
6489     }
6490     return badRegion;
6491 }
6492 
canSplitInst(G4_INST * inst,G4_INST * use_op)6493 bool HWConformity::canSplitInst(G4_INST* inst, G4_INST* use_op)
6494 {
6495     if ((inst->getPredicate() && inst->getExecSize() < g4::SIMD16) || hasBadRegion(inst))
6496         return false;
6497 
6498     G4_CondMod* condMod = inst->getCondMod();
6499     if (condMod)
6500     {
6501         return false;
6502     }
6503 
6504     for (int i = 0; i < inst->getNumSrc(); i++)
6505     {
6506         G4_Operand* src = inst->getSrc(i);
6507         if (src->isAccReg())
6508         {
6509             // don't split inst with explicit acc
6510             return false;
6511         }
6512         if (src->isSrcRegRegion() &&
6513             src->asSrcRegRegion()->getRegion()->vertStride == 32 &&
6514             src->asSrcRegRegion()->getRegion()->width == 1)
6515         {
6516             // don't split the source into even/odd since verstride can't exceed 32
6517             // ToDo: check for horizontal stride as well?
6518             return false;
6519         }
6520     }
6521 
6522     return true;
6523 }
6524 
canSplitByteDst(G4_opcode op)6525 bool HWConformity::canSplitByteDst(G4_opcode op)
6526 {
6527     switch (op)
6528     {
6529     case G4_mac:
6530     case G4_mach:
6531     case G4_cmp:
6532     case G4_mad:
6533     case G4_sad2:
6534     case G4_sada2:
6535     case G4_line:
6536     case G4_send:
6537     case G4_sendc:
6538         return false;
6539     default:
6540         return true;
6541     }
6542 }
6543 // split one instruction into 2 if its dstination is packed byte and execution type is W.
6544 // for example:
6545 // add <16> V1(0,0)<1>:b V1(0,0)<16;16,1>:w V2(0,0)<16;16,1>:w
6546 // ==>
6547 // add <8> V1(0,0)<2>:b V1(0,0)<16;8,2>:w V2(0,0)<16;8,2>:w
6548 // add <8> V1(0,1)<2>:b V1(0,1)<16;8,2>:w V2(0,1)<16;8,2>:w
6549 
6550 // if predicate is used for instruction, the definition of this predicate is tracked and the
6551 // corresponding instruction is checked to see if it can do the same split.
splitInstListForByteDst(INST_LIST_ITER it,G4_BB * bb,uint16_t extypesize)6552 bool HWConformity::splitInstListForByteDst(INST_LIST_ITER it, G4_BB* bb, uint16_t extypesize)
6553 {
6554     G4_INST* inst = *it;
6555     G4_opcode inst_op = inst->opcode();
6556     G4_DstRegRegion* dst = inst->getDst();
6557     // check if we can split the inst
6558     if (!canSplitByteDst(inst_op) ||
6559         inst->getExecSize() == g4::SIMD1 ||
6560         (!bb->isAllLaneActive() && !inst->isWriteEnableInst()) ||
6561         dst->getByteOffset() % extypesize != 0 ||
6562         dst->getHorzStride() != 1 ||
6563         extypesize != TypeSize(Type_W))
6564     {
6565         return false;
6566     }
6567 
6568     if (inst->getPredicate() || inst->getCondMod())
6569     {
6570         return false;
6571     }
6572 
6573     // recursively the inst that defines its predicate can be split
6574     INST_LIST expandOpList;
6575     bool canSplit = canSplitInst(inst, NULL);
6576     if (canSplit)
6577     {
6578         expandOpList.push_back(inst);
6579     }
6580 
6581     G4_INST* currInst = inst;
6582     while (canSplit && currInst->getPredicate())
6583     {
6584         // look for predicate def inst
6585         uint16_t defNum = 0;
6586         G4_INST* defInst = NULL;
6587 
6588         // FIXME: should be currInst->defInstList.begin()?
6589         for (auto def_iter = inst->def_begin(), end = inst->def_end(); def_iter != end; def_iter++)
6590         {
6591             if ((*def_iter).second == Opnd_pred)
6592             {
6593                 defNum++;
6594                 defInst = (*def_iter).first;
6595             }
6596         }
6597         if (defNum != 1 || !defInst->getCondMod())
6598         {
6599             canSplit = false;
6600             break;
6601         }
6602         if (canSplit)
6603         {
6604             if (!bb->isAllLaneActive() && !defInst->isWriteEnableInst())
6605             {
6606                 canSplit = false;
6607             }
6608             else
6609             {
6610                 canSplit = canSplitInst(defInst, currInst);
6611             }
6612         }
6613         // check if def inst can be split
6614         if (!canSplit)
6615         {
6616             break;
6617         }
6618         else
6619         {
6620             expandOpList.push_back(defInst);
6621             currInst = defInst;
6622         }
6623     }
6624 
6625     // split inst into two
6626     INST_LIST_ITER new_iter = it;
6627     new_iter++;
6628     if (canSplit)
6629     {
6630         while (!expandOpList.empty())
6631         {
6632             G4_INST* expand_op = expandOpList.front();
6633             expandOpList.pop_front();
6634             // find location of expand_op in instruction list
6635             do
6636             {
6637                 new_iter--;
6638                 if ((*new_iter) == expand_op)
6639                 {
6640                     break;
6641                 }
6642             } while (new_iter != bb->begin());
6643 
6644             MUST_BE_TRUE(new_iter != bb->end(), "Cannot find predicate definition function in BB.");
6645             new_iter++;
6646             G4_INST* secondHalfOp = splitInstWithByteDst(expand_op);
6647             MUST_BE_TRUE(secondHalfOp, "Error in spliting instruction.");
6648             bb->insertBefore(new_iter, secondHalfOp);
6649         }
6650     }
6651 
6652 
6653     return canSplit;
6654 }
6655 
// Splits expand_op into two half-size instructions.
// expand_op itself is modified in place to become the first half (execution
// size halved, dst stride doubled, source regions rewritten). A newly created
// instruction for the second half is returned; it is NOT inserted into any
// basic block here, the caller is responsible for that.
G4_INST* HWConformity::splitInstWithByteDst(G4_INST* expand_op)
{
    G4_ExecSize newExecSize {expand_op->getExecSize() / 2};

    // Adjust predicate / cond modifier of the original before they are
    // duplicated for the second half.
    // NOTE(review): exact effect of splitPred()/splitCondMod() is defined elsewhere.
    if (expand_op->getPredicate())
    {
        expand_op->getPredicate()->splitPred();
    }
    if (expand_op->getCondMod())
    {
        expand_op->getCondMod()->splitCondMod();
    }
    // Create the second half with the same opcode, saturate flag and options;
    // dst and sources are filled in below.
    G4_INST* expand_sec_half_op = builder.createInternalInst(
        builder.duplicateOperand(expand_op->getPredicate()),
        expand_op->opcode(),
        builder.duplicateOperand(expand_op->getCondMod()),
        expand_op->getSaturate(),
        newExecSize,
        NULL,
        NULL,
        NULL,
        NULL,
        expand_op->getOption());
    MUST_BE_TRUE(expand_sec_half_op != NULL, ERROR_MEM_ALLOC);

    expand_op->setExecSize(newExecSize);

    if (expand_op->getDst() && !expand_op->hasNULLDst())
    {
        // Non-null dst: both halves get a dst with doubled horizontal stride.
        // The second half is shifted by one element: via the sub-register
        // offset for a direct dst, via the address immediate for an indirect one.
        G4_DstRegRegion* old_dst = expand_op->getDst();
        short secondSubRegOff = old_dst->getSubRegOff() + 1;

        G4_DstRegRegion* newDstOpnd = nullptr;

        if (!old_dst->isIndirect())
        {
            newDstOpnd = builder.createDst(
                old_dst->getBase(),
                old_dst->getRegOff(),
                old_dst->getSubRegOff(),
                old_dst->getHorzStride() * 2,
                old_dst->getType());
        }
        else
        {
            newDstOpnd = builder.createIndirectDst(
                old_dst->getBase(),
                old_dst->getSubRegOff(),
                old_dst->getHorzStride() * 2,
                old_dst->getType(),
                old_dst->getAddrImm());
            // Indirect: keep the original sub-register offset for the second
            // half; the shift is applied to the address immediate instead.
            secondSubRegOff -= 1;
        }

        expand_op->setDest(newDstOpnd);

        G4_DstRegRegion* secondDstOpnd = nullptr;

        if (!old_dst->isIndirect())
        {
            secondDstOpnd = builder.createDst(
                old_dst->getBase(),
                old_dst->getRegOff(),
                secondSubRegOff,
                old_dst->getHorzStride() * 2,
                old_dst->getType());
        }
        else
        {
            secondDstOpnd = builder.createIndirectDst(
                old_dst->getBase(),
                secondSubRegOff,
                old_dst->getHorzStride() * 2,
                old_dst->getType(),
                old_dst->getAddrImm() + 1);
        }

        expand_sec_half_op->setDest(secondDstOpnd);
    }
    else
    {
        // Missing or null dst: the second half reuses the same dst operand.
        expand_sec_half_op->setDest(expand_op->getDst());
    }

    for (int k = 0, n_srcs = expand_op->getNumSrc(); k < n_srcs; k++)
    {
        G4_Operand* expand_src = expand_op->getSrc(k);

        if (!expand_src)
            continue;

        // Immediates (and a null src1 of a math instruction) are passed to the
        // second half unchanged.
        if ((expand_op->isMath() && k == 1 && expand_src->isNullReg()) ||
            expand_src->isImm()) {
            expand_sec_half_op->setSrc(expand_src, k);
        }
        else if (expand_src->isSrcRegRegion()) {
            G4_SrcRegRegion* expandSrcRegion = expand_src->asSrcRegRegion();

            if (expandSrcRegion->isScalar()) {
                // Scalar source: both halves read the same element.
                expand_sec_half_op->setSrc(builder.duplicateOperand(expand_src), k);
            }
            else {
                // secondSubRegOffDiff: element distance from the first-half
                // start to the second-half start.
                // secondAddrImmedDiff: the same distance in bytes.
                short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;

                const RegionDesc* origRegion = expandSrcRegion->getRegion();
                const RegionDesc* newRegion = NULL;

                if (origRegion->width == 1)
                {
                    // <V;1,H>: take every other row, i.e. double the vertical
                    // stride; the second half starts one row (V elements) later.
                    newRegion = builder.createRegionDesc(origRegion->vertStride * 2, origRegion->width, origRegion->horzStride);
                    secondSubRegOffDiff = origRegion->vertStride;
                }
                else
                {
                    // <V;W,H>: halve the width and double the horizontal stride
                    // (stride 0 when the new width is 1); the second half starts
                    // H elements later.
                    unsigned short newWD = origRegion->width / 2;
                    secondSubRegOffDiff = origRegion->horzStride;
                    newRegion = builder.createRegionDesc(
                        (newWD == 1 && newExecSize == 1) ? 0 : origRegion->vertStride,
                        newWD, (newWD == 1) ? 0 : origRegion->horzStride * 2);
                }
                secondAddrImmedDiff = (short)(secondSubRegOffDiff * expand_src->getTypeSize());
                // The first half keeps its operand and only gets the new region.
                expandSrcRegion->setRegion(newRegion);

                bool directSrc = (expandSrcRegion->getRegAccess() == Direct);
                // If the byte distance reaches numEltPerGRF<Type_UB>(), a direct
                // second-half source moves to the next register (see the +1 on
                // the register offset below) and the sub-register distance is
                // reduced by the same amount.
                if (secondAddrImmedDiff >= (int)numEltPerGRF<Type_UB>())
                {
                    secondSubRegOffDiff =
                        (short)((secondAddrImmedDiff - numEltPerGRF<Type_UB>()) / expand_src->getTypeSize());
                }
                G4_SrcRegRegion* secondSrcOpnd = builder.createSrcRegRegion(
                    expandSrcRegion->getModifier(),
                    expandSrcRegion->getRegAccess(),
                    expandSrcRegion->getBase(),
                    expandSrcRegion->getRegOff() + ((directSrc && secondAddrImmedDiff >= (int)numEltPerGRF<Type_UB>()) ? 1 : 0),
                    expandSrcRegion->getSubRegOff() + (directSrc ? secondSubRegOffDiff : 0),
                    newRegion,
                    expandSrcRegion->getType());
                // Non-direct source: the shift goes into the address immediate
                // (in bytes) rather than into the register/sub-register offsets.
                if (expandSrcRegion->getRegAccess() != Direct)
                {
                    secondSrcOpnd->setImmAddrOff(expandSrcRegion->getAddrImm() + secondAddrImmedDiff);
                }
                expand_sec_half_op->setSrc(secondSrcOpnd, k);
            }
        }
    }
    expand_sec_half_op->inheritDIFrom(expand_op);

    // Only when a predicate or cond modifier is present: choose the mask option
    // of the second half from the mask offset of the first half.
    if (expand_op->getPredicate() || expand_op->getCondMod())
    {
        if (expand_op->getMaskOffset() == 0)
        {
            expand_sec_half_op->setMaskOption(InstOpt_M8);
        }
        else if (expand_op->getMaskOffset() == 16)
        {
            expand_sec_half_op->setMaskOption(InstOpt_M24);
        }
        // Any other mask offset: the mask option is set unless this is a sel with
        // a cond modifier and no predicate.
        else if (!(expand_op->opcode() == G4_sel && !(expand_op->getPredicate()) && expand_op->getCondMod()))
        {
            expand_sec_half_op->setMaskOption(newExecSize > 8 ? InstOpt_M16 : InstOpt_M8);
        }
    }
    return expand_sec_half_op;
}
6820 
6821 //  in addition, fix the source region to follow the region restriction:
6822 //  1. ExecSize must be greater than or equal to Width.  -- no check for this one
6823 //  2. If ExecSize = Width and HorzStride ? 0, VertStride must be set to Width * HorzStride.
6824 //  3. If ExecSize = Width and HorzStride = 0, there is no restriction on VertStride.
6825 //  4. If Width = 1, HorzStride must be 0 regardless of the values of ExecSize and VertStride.
6826 //  5. If ExecSize = Width = 1, both VertStride and HorzStride must be 0. This defines a scalar.
6827 //  6. If VertStride = HorzStride = 0, Width must be 1 regardless of the value of ExecSize.
6828 //  7. Dst.HorzStride must not be 0.        -- this needs not to be checked.
6829 //  8. VertStride must be used to cross GRF register boundaries. This rule implies that
6830 //      elements within a 'Width' cannot cross GRF boundaries.
fixSrcRegion(G4_INST * inst)6831 void HWConformity::fixSrcRegion(G4_INST* inst)
6832 {
6833     bool comprInst = isCompressedInst(inst);
6834     for (int i = 0; i < G4_MAX_SRCS; i++)
6835     {
6836         if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() && !inst->getSrc(i)->isNullReg())
6837         {
6838             G4_SrcRegRegion* src = inst->getSrc(i)->asSrcRegRegion();
6839             const RegionDesc* srcRegion = src->getRegion();
6840             if (srcRegion->isRegionWH() || srcRegion->isRegionV() || srcRegion->isRegionSW())
6841             {
6842                 // normalize VxH regions if possible
6843                 if (srcRegion->isRegionWH() && srcRegion->width == inst->getExecSize())
6844                 {
6845                     // r[a0.0]<E, S> -> r[a0.0]<S;1,0>
6846                     src->setRegion(builder.createRegionDesc(srcRegion->horzStride, 1, 0));
6847                 }
6848                 // ToDo: add other legalization
6849                 continue;
6850             }
6851 
6852             //ToDo: most of these checks should be obsolete at this point
6853             uint16_t vs = srcRegion->vertStride, wd = srcRegion->width, hs = srcRegion->horzStride;
6854             uint8_t exSize = inst->getExecSize();
6855             MUST_BE_TRUE(inst->isSend() || exSize >= wd, " Bad source region: Width is greater than execution size.");
6856             if (comprInst)
6857             {
6858                 if (inst->getSrc(i)->getTypeSize() > G4_WSIZE &&
6859                     wd == exSize &&
6860                     vs == wd && hs == 1)
6861                 {
6862                     vs = wd = exSize / 2;
6863                 }
6864             }
6865             if (wd == exSize && hs != 0 && vs != wd * hs)
6866             {
6867                 // <V;E,H> --> <V*H;E,H>
6868                 vs = wd * hs;
6869             }
6870             if (wd == 1)
6871             {
6872                 // <V;1,H> -> <V;1,0> or <0;1,0>
6873                 hs = 0;
6874                 if (1 == exSize)
6875                     vs = 0;
6876             }
6877             if (vs == 0 && hs == 0)
6878             {
6879                 // <0;N,0> -> <0;1,0>
6880                 wd = 1;
6881             }
6882             if (hs == 0 &&
6883                 ((inst->getSrc(i)->getTypeSize() == G4_WSIZE &&
6884                     exSize == 32 && vs == 32 && wd == 32) ||
6885                     (inst->getSrc(i)->getTypeSize() == G4_DSIZE &&
6886                         exSize == 16 && vs == 16 && wd == 16)))
6887             {
6888                 vs = 0;
6889                 wd = 1;
6890             }
6891 
6892             // check cross GRF (rule 2H)
6893             // TODO! for the following two cases, split the instruction:
6894             // source region is like <8;4,1>
6895             // source region is like <2;4,1>
6896             if (src->getRegAccess() == Direct && src->crossGRF() && hs != 0)
6897             {
6898                 // TODO: this is a temp fix
6899                 if ((builder.getPlatform() == GENX_BDW || builder.getPlatform() == GENX_CHV) && vs < wd * hs)
6900                     continue;
6901                 // check number of elements in first GRF.
6902                 uint16_t execTypeSize = hs * src->getElemSize();
6903                 uint16_t sizeInFirstGRF = numEltPerGRF<Type_UB>() - src->getLeftBound() % numEltPerGRF<Type_UB>();
6904                 uint16_t vertSize = vs * src->getTypeSize();
6905                 uint16_t numEle = (sizeInFirstGRF + execTypeSize - 1) / execTypeSize;
6906                 uint16_t rowSize = wd * execTypeSize;
6907 
6908                 if (sizeInFirstGRF <= vertSize)
6909                 {
6910                     if (numEle >= wd)
6911                     {
6912                         numEle = wd;
6913                     }
6914                 }
6915                 else if (vs > wd)
6916                 {
6917                     numEle = sizeInFirstGRF / vertSize * wd +
6918                         ((sizeInFirstGRF % vertSize > rowSize) ? wd : (sizeInFirstGRF % vertSize + execTypeSize - 1) / execTypeSize);
6919                 }
6920                 // wd is used to cross GRF, change to <vs;1,0>
6921                 if (numEle < wd || (wd >= vs && numEle % wd != 0))
6922                 {
6923 
6924                     wd = 1;
6925                     if (hs == 0)
6926                     {
6927                         vs = 1;
6928                     }
6929                     else
6930                     {
6931                         vs = hs;
6932                     }
6933                     hs = 0;
6934                 }
6935             }
6936 
6937             if (vs != srcRegion->vertStride || wd != srcRegion->width || hs != srcRegion->horzStride)
6938             {
6939                 G4_SrcRegRegion* origSrc = inst->getSrc(i)->asSrcRegRegion();
6940                 origSrc->setRegion(builder.createRegionDesc(vs, wd, hs));
6941             }
6942         }
6943     }
6944     if (inst->getDst() && !inst->hasNULLDst())
6945     {
6946         MUST_BE_TRUE(inst->getDst()->getHorzStride() != 0,
6947             "Bad source region: Width is greater than execution size.");
6948     }
6949 }
6950 
6951 //
6952 //single entry point for HW conformity checks
6953 //
HWConformityChk(IR_Builder & builder,G4_Kernel & kernel,Mem_Manager & mem)6954 void HWConformityChk(IR_Builder& builder, G4_Kernel& kernel, Mem_Manager& mem)
6955 {
6956     HWConformity conformity(builder, kernel, mem);
6957     conformity.chkHWConformity();
6958 }
6959 
markPackedByteReference(G4_Kernel & kernel,G4_Operand * opnd,G4_INST * inst)6960 bool HWConformity::markPackedByteReference(G4_Kernel& kernel, G4_Operand* opnd, G4_INST* inst)
6961 {
6962     G4_Declare* dcl = NULL, * topdcl = NULL;
6963     bool foundOptCandidate = false;
6964 
6965     if ((opnd->isSrcRegRegion() || opnd->isDstRegRegion()))
6966     {
6967         if (opnd->getBase() && opnd->getBase()->isRegVar())
6968         {
6969             dcl = opnd->getBase()->asRegVar()->getDeclare();
6970             topdcl = dcl->getRootDeclare();
6971         }
6972     }
6973 
6974     if (topdcl != NULL &&
6975         topdcl->getRegFile() == G4_GRF &&
6976         !(topdcl->getAddressed()))
6977     {
6978         if (topdcl->doNotWiden() || inst->mayExceedTwoGRF())
6979         {
6980             //send has no regioning so it is certainly illegal to change data layout
6981             setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
6982             return false;
6983         }
6984 
6985         if (opnd->isDstRegRegion() &&
6986             // check if the opnd has pre-assigned physical regsiter
6987             !(topdcl->getRegVar()->isPhyRegAssigned()) &&
6988             // check if the opnd is global
6989             !(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
6990             // check if the opnd is used as packed byte
6991             opnd->getTypeSize() == 1 &&
6992             !hasDedicateAlignRegionConformity(inst) &&
6993             dcl->getElemSize() == 1 &&
6994             opnd->asDstRegRegion()->getHorzStride() == 1 &&
6995             // check if the instruction is a raw mov
6996             !inst->isRawMov() &&
6997             // check if the instruction execution type is word
6998             // (This should be the most common case that can benefit
6999             //  from this optimization. It could be extended to other
7000             //  cases like D execution type).
7001             TypeSize(inst->getExecType()) == 2)
7002         {
7003             unsigned int leftBound = opnd->asDstRegRegion()->getLeftBound();
7004             unsigned int rightBound = opnd->asDstRegRegion()->getRightBound();
7005 
7006             if (((rightBound * 2 / numEltPerGRF<Type_UB>() - leftBound * 2 / numEltPerGRF<Type_UB>()) > 1) ||
7007                 (builder.getPlatform() == GENX_BDW &&
7008                 (rightBound * 2 / numEltPerGRF<Type_UB>() != leftBound * 2 / numEltPerGRF<Type_UB>())))
7009             {
7010                 setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
7011             }
7012             else if (getAccessPattern(topdcl) == ACCESS_PATTERN_UNDEF)
7013             {
7014                 setAccessPattern(topdcl, ACCESS_PATTERN_PACKED_BYTE);
7015                 foundOptCandidate = true;
7016             }
7017         }
7018         else if (opnd->isSrcRegRegion() &&
7019             // check if the opnd has pre-assigned physical regsiter
7020             !(opnd->asSrcRegRegion()->getBase()->asRegVar()->isPhyRegAssigned()) &&
7021             // check if the opnd is global
7022             !(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
7023             // check if the opnd is used as packed byte
7024             opnd->getTypeSize() == 1 &&
7025             dcl->getElemSize() == 1 &&
7026             opnd->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()))
7027         {
7028             unsigned int leftBound = opnd->asSrcRegRegion()->getLeftBound();
7029             unsigned int rightBound = opnd->asSrcRegRegion()->getRightBound();
7030 
7031             if (((rightBound * 2 / numEltPerGRF<Type_UB>() - leftBound * 2 / numEltPerGRF<Type_UB>()) > 1) ||
7032                 (builder.getPlatform() == GENX_BDW &&
7033                 (rightBound * 2 / numEltPerGRF<Type_UB>() != leftBound * 2 / numEltPerGRF<Type_UB>())))
7034             {
7035                 setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
7036             }
7037         }
7038         else
7039         {
7040             setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
7041         }
7042     }
7043 
7044     return foundOptCandidate;
7045 }
7046 
fixPackedByteReference(IR_Builder & builder,G4_Operand * opnd)7047 G4_Operand* HWConformity::fixPackedByteReference(IR_Builder& builder, G4_Operand* opnd)
7048 {
7049     G4_Operand* newOpnd = NULL;
7050     G4_Declare* topdcl = NULL;
7051 
7052     if (opnd->isDstRegRegion() ||
7053         opnd->isSrcRegRegion())
7054     {
7055         topdcl = GetTopDclFromRegRegion(opnd);
7056     }
7057 
7058     if (topdcl != NULL &&
7059         getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
7060     {
7061         if (opnd->isDstRegRegion())
7062         {
7063             short dst_regoff = opnd->asDstRegRegion()->getRegOff();
7064             short dst_subregoff = opnd->asDstRegRegion()->getSubRegOff();
7065             short off = (dst_regoff * numEltPerGRF<Type_UB>() + dst_subregoff) * 2;
7066 
7067             dst_regoff = off / numEltPerGRF<Type_UB>();
7068             dst_subregoff = off % numEltPerGRF<Type_UB>();
7069 
7070             G4_DstRegRegion* newDstOpnd = builder.createDst(
7071                 opnd->getBase()->asRegVar(),
7072                 dst_regoff,
7073                 dst_subregoff,
7074                 2,
7075                 opnd->getType());
7076             newOpnd = newDstOpnd;
7077         }
7078         else if (opnd->isSrcRegRegion())
7079         {
7080             short src_regoff = opnd->asSrcRegRegion()->getRegOff();
7081             short src_subregoff = opnd->asSrcRegRegion()->getSubRegOff();
7082             short off = (src_regoff * numEltPerGRF<Type_UB>() + src_subregoff) * 2;
7083 
7084             src_regoff = off / numEltPerGRF<Type_UB>();
7085             src_subregoff = off % numEltPerGRF<Type_UB>();
7086 
7087             const RegionDesc* rd = builder.getRegionStride2();
7088             G4_SrcRegRegion* newSrcOpnd = builder.createSrcRegRegion(opnd->asSrcRegRegion()->getModifier(),
7089                 Direct,
7090                 opnd->getBase()->asRegVar(),
7091                 src_regoff,
7092                 src_subregoff,
7093                 rd,
7094                 opnd->getType());
7095             newOpnd = newSrcOpnd;
7096         }
7097     }
7098 
7099     return newOpnd;
7100 }
7101 
fixDataLayout()7102 void HWConformity::fixDataLayout()
7103 {
7104     bool changeDataLayout = false;
7105 
7106     for (auto& bb : kernel.fg)
7107     {
7108         for (auto& inst : *bb)
7109         {
7110             if (G4_Inst_Table[inst->opcode()].n_dst == 1)
7111             {
7112                 G4_Operand* dst = inst->getDst();
7113 
7114                 if (dst)
7115                 {
7116                     bool foundOptCandidate = markPackedByteReference(kernel, dst, inst);
7117                     if (changeDataLayout == false && foundOptCandidate)
7118                     {
7119                         changeDataLayout = true;
7120                     }
7121                 }
7122             }
7123 
7124             for (int i = 0; i < inst->getNumSrc(); i++)
7125             {
7126                 G4_Operand* src = inst->getSrc(i);
7127 
7128                 if (src)
7129                 {
7130                     markPackedByteReference(kernel, src, inst);
7131                 }
7132             }
7133         }
7134     }
7135 
7136     if (changeDataLayout)
7137     {
7138         for (auto& dcl : kernel.Declares)
7139         {
7140             G4_Declare* topdcl = dcl->getRootDeclare();
7141 
7142             if (getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
7143             {
7144                 dcl->setTotalElems(dcl->getTotalElems() * 2);
7145 
7146                 if (dcl != topdcl)
7147                 {
7148                     G4_Declare* aliasDcl = dcl->getAliasDeclare();
7149                     unsigned int aliasOffset = dcl->getAliasOffset();
7150                     dcl->setAliasDeclare(aliasDcl, aliasOffset * 2);
7151                 }
7152             }
7153         }
7154 
7155         for (auto& bb : kernel.fg)
7156         {
7157             for (auto& inst : *bb)
7158             {
7159                 if (G4_Inst_Table[inst->opcode()].n_dst == 1)
7160                 {
7161                     G4_Operand* dst = inst->getDst();
7162                     G4_Operand* newDst = NULL;
7163 
7164                     if (dst)
7165                     {
7166                         newDst = fixPackedByteReference(builder, dst);
7167                         if (newDst)
7168                         {
7169                             inst->setDest(newDst->asDstRegRegion());
7170                         }
7171                     }
7172                 }
7173 
7174                 for (int i = 0; i < inst->getNumSrc(); i++)
7175                 {
7176                     G4_Operand* src = inst->getSrc(i);
7177                     G4_Operand* newSrc = NULL;
7178 
7179                     if (src)
7180                     {
7181                         newSrc = fixPackedByteReference(builder, src);
7182                         if (newSrc)
7183                         {
7184                             inst->setSrc(newSrc, i);
7185                         }
7186                     }
7187                 }
7188             }
7189         }
7190     }
7191 }
7192 
7193 // maintain def-use chain for current inst and the MOV inst generated for its dst
maintainDU4TempMov(G4_INST * inst,G4_INST * newInst)7194 void HWConformity::maintainDU4TempMov(G4_INST* inst, G4_INST* newInst)
7195 {
7196     if (newInst->getPredicate())
7197     {
7198         inst->transferDef(newInst, Opnd_pred, Opnd_pred);
7199     }
7200 
7201     inst->transferUse(newInst);
7202 
7203     inst->addDefUse(newInst, Opnd_src0);
7204 }
7205 
expandPlaneMacro(IR_Builder & builder,INST_LIST_ITER it,G4_BB * bb,bool secondHalf)7206 static void expandPlaneMacro(IR_Builder& builder, INST_LIST_ITER it, G4_BB* bb, bool secondHalf)
7207 {
7208     G4_INST* inst = *it;
7209     G4_DstRegRegion* dst = inst->getDst();
7210     G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
7211     G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
7212 
7213     G4_SrcRegRegion* srcP = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
7214         src0->getRegOff(), src0->getSubRegOff(), builder.getRegionScalar(), src0->getType());
7215     G4_SrcRegRegion* srcQ = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
7216         src0->getRegOff(), src0->getSubRegOff() + 1, builder.getRegionScalar(), src0->getType());
7217     G4_SrcRegRegion* srcR = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
7218         src0->getRegOff(), src0->getSubRegOff() + 3, builder.getRegionScalar(), src0->getType());
7219 
7220     auto u = builder.createSrcWithNewRegOff(src1, src1->getRegOff() + (secondHalf ? 2 : 0));
7221     auto v = builder.createSrcWithNewRegOff(src1, src1->getRegOff() + (secondHalf ? 3 : 1));
7222     if (getGRFSize() == 64)
7223     {
7224         u = builder.createSrcRegRegion(src1->getModifier(), Direct, src1->getBase(),
7225             src1->getRegOff() + (secondHalf ? 1 : 0), src1->getSubRegOff(), src1->getRegion(), src1->getType(), src1->getAccRegSel());
7226         v = builder.createSrcRegRegion(src1->getModifier(), Direct, src1->getBase(),
7227             src1->getRegOff() + (secondHalf ? 1 : 0), src1->getSubRegOff() + 8, src1->getRegion(), src1->getType(), src1->getAccRegSel());
7228     }
7229 
7230     uint32_t options = inst->getOption();
7231     if (inst->getExecSize() == g4::SIMD16)
7232     {
7233         options &= ~InstOpt_QuarterMasks;
7234         int maskOffset = inst->getMaskOffset() + (secondHalf ? 8 : 0);
7235         switch (maskOffset)
7236         {
7237         case 0:
7238             options |= InstOpt_M0;
7239             break;
7240         case 8:
7241             options |= InstOpt_M8;
7242             break;
7243         case 16:
7244             options |= InstOpt_M16;
7245             break;
7246         case 24:
7247             options |= InstOpt_M24;
7248             break;
7249         default:
7250             MUST_BE_TRUE(false, "unexpected offset value");
7251         }
7252     }
7253 
7254     G4_Declare* tmpVal = builder.hasNFType() ? nullptr : builder.createTempVar(8, Type_F, Any);
7255     G4_DstRegRegion* accDst = builder.hasNFType() ?
7256         builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, Type_NF) :
7257         builder.createDstRegRegion(tmpVal, 1);
7258     G4_INST* madInst = builder.createInternalInst(
7259         nullptr, G4_mad, nullptr, g4::NOSAT, g4::SIMD8,
7260         accDst, srcR, u, srcP,
7261         options | InstOpt_WriteEnable);
7262     bb->insertBefore(it, madInst);
7263 
7264     G4_Predicate* pred = inst->getPredicate() ? builder.duplicateOperand(inst->getPredicate()) : nullptr;
7265     G4_CondMod* condMod = inst->getCondMod() ? builder.duplicateOperand(inst->getCondMod()) : nullptr;
7266     G4_SrcRegRegion* accSrc = builder.hasNFType() ?
7267         builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0, builder.getRegionStride1(), Type_NF) :
7268         builder.createSrcRegRegion(tmpVal, builder.getRegionStride1());
7269     G4_DstRegRegion* newDst = builder.createDst(dst->getBase(),
7270         dst->getRegOff() + (secondHalf ? 1 : 0), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
7271     if (getGRFSize() == 64)
7272     {
7273         newDst = builder.createDst(dst->getBase(),
7274             dst->getRegOff(), dst->getSubRegOff() + (secondHalf ? 8 : 0), dst->getHorzStride(), dst->getType());
7275     }
7276     G4_INST* secondMadInst = builder.createInternalInst(
7277         pred, G4_mad, condMod, inst->getSaturate(), g4::SIMD8,
7278         newDst, accSrc, v, srcQ, options);
7279     bb->insertBefore(it, secondMadInst);
7280 }
7281 
7282 // Replace plane with a macro sequence:
7283 // pln dest:f src0:f src1:f
7284 // -->
7285 // mad acc0:nf src0.3:f src1:f src0.0:f
7286 // mad dest:f acc0:nf src1+1:f src0.1:f
7287 // simd16 pln also needs to be split as the macro is simd8 only
7288 
expandPlaneInst(INST_LIST_ITER it,G4_BB * bb)7289 void HWConformity::expandPlaneInst(INST_LIST_ITER it, G4_BB* bb)
7290 {
7291     G4_INST* inst = *it;
7292     MUST_BE_TRUE(inst->opcode() == G4_pln, "expect a plane inst");
7293     MUST_BE_TRUE(inst->getSrc(0)->isSrcRegRegion(), "src0 must be source reg region");
7294 
7295     G4_DstRegRegion* dst = inst->getDst();
7296     if (dst->getRegAccess() == IndirGRF || dst->getHorzStride() > 1)
7297     {
7298         inst->setDest(insertMovAfter(it, dst, dst->getType(), bb));
7299     }
7300     G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
7301     if (src0->getRegAccess() == IndirGRF)
7302     {
7303         // insert move to make src0 direct
7304         inst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
7305     }
7306     G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
7307     if (src1->getRegAccess() == IndirGRF)
7308     {
7309         // insert move to make src1 direct
7310         inst->setSrc(insertMovBefore(it, 1, src1->getType(), bb), 1);
7311     }
7312 
7313     expandPlaneMacro(builder, it, bb, false);
7314     if (inst->getExecSize() == g4::SIMD16)
7315     {
7316         expandPlaneMacro(builder, it, bb, true);
7317     }
7318 
7319     it = bb->erase(it);
7320 }
7321 
7322 // plane does not support pln with non-packed dst.
7323 // also fix up plane sources, which don't support modifiers
7324 // returns true if the original plane is deleted
fixPlaneInst(INST_LIST_ITER it,G4_BB * bb)7325 bool HWConformity::fixPlaneInst(INST_LIST_ITER it, G4_BB* bb)
7326 {
7327 
7328     G4_INST* inst = *it;
7329     if (inst->opcode() == G4_pln)
7330     {
7331         if (!builder.doPlane())
7332         {
7333             expandPlaneInst(it, bb);
7334             return true;
7335         }
7336         G4_DstRegRegion* dst = inst->getDst();
7337         if (dst->getHorzStride() != 1)
7338         {
7339             G4_DstRegRegion* newDst = insertMovAfter(it, dst, dst->getType(), bb);
7340             inst->setDest(newDst);
7341         }
7342 
7343         G4_Operand* src0 = inst->getSrc(0);
7344         G4_Operand* src1 = inst->getSrc(1);
7345 
7346         // Source modifiers are not supported for pln instruction
7347         if (src0 &&
7348             ((src0->isSrcRegRegion() &&
7349                 src0->asSrcRegRegion()->getModifier() != Mod_src_undef) ||
7350                 !builder.isOpndAligned(src0, 16)))
7351         {
7352             // src0 needs a temp
7353             G4_Declare* tmpDcl = builder.createTempVar(4, Type_F,
7354                 GRFALIGN);
7355 
7356             // Before:
7357             // pln (16) dst, (mod)src0, src1
7358             //
7359             // After:
7360             // mov (4) tmp(0,0):f (mod)src0(r)<4;4,1>:f
7361             // pln (16) dst, tmp(0,0)<0;1,0>, src1
7362             G4_DstRegRegion* dstRgn = builder.createDst(
7363                 tmpDcl->getRegVar(),
7364                 0,
7365                 0,
7366                 1,
7367                 Type_F);
7368 
7369             const RegionDesc* rd = builder.createRegionDesc(4, 4, 1);
7370             G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
7371                 src0->asSrcRegRegion()->getModifier(),
7372                 Direct,
7373                 src0->asSrcRegRegion()->getBase(),
7374                 src0->asSrcRegRegion()->getRegOff(),
7375                 src0->asSrcRegRegion()->getSubRegOff(),
7376                 rd,
7377                 Type_F);
7378 
7379             G4_INST* newInst = builder.createMov(g4::SIMD4, dstRgn, srcRgn, InstOpt_NoOpt, false);
7380 
7381             bb->insertBefore(it, newInst);
7382 
7383             rd = builder.getRegionScalar();
7384             G4_SrcRegRegion* newSrcRgn = builder.createSrc(
7385                 tmpDcl->getRegVar(),
7386                 0,
7387                 0,
7388                 rd,
7389                 Type_F);
7390 
7391             inst->setSrc(newSrcRgn, 0);
7392             inst->transferDef(newInst, Opnd_src0, Opnd_src0);
7393             newInst->addDefUse(inst, Opnd_src0);
7394         }
7395 
7396         if (src1 && src1->isSrcRegRegion() && src1->asSrcRegRegion()->getModifier() != Mod_src_undef)
7397         {
7398             // src1 needs a temp
7399             // For pln instruction src2 is implied from src1 and exec_size
7400             // When exec_size = 8, src2 is 1 GRF after src1 with size = 1 GRF
7401             // When exec_size = 16, src2 is 2 GRFs after src1 with size = 2 GRFs
7402             unsigned short numGRFsToCopy = inst->getExecSize() == g4::SIMD8 ? 2 : 4;
7403 
7404             G4_Declare* tmpDcl = builder.createTempVar((unsigned short)(numEltPerGRF<Type_UB>() / TypeSize(Type_F) * numGRFsToCopy), Type_F,
7405                 Any);
7406 
7407             // Before:
7408             // pln (16) dst, src0, (mod)src1
7409             //
7410             // After:
7411             // mov (16) tmp(0,0):f (mod)src1(r)<8;8,1>:f
7412             // mov (16) tmp(2,0):f (mod)src1(r+2)<8;8,1>:f <-- only if exec_size = 16
7413             // pln (16) dst, src0, tmp(0,0)
7414             for (int i = 0; i < numGRFsToCopy; i += 2)
7415             {
7416                 G4_DstRegRegion* dstRgn = builder.createDst(
7417                     tmpDcl->getRegVar(),
7418                     (short)i,
7419                     0,
7420                     1,
7421                     Type_F);
7422 
7423                 const RegionDesc* rd = builder.createRegionDesc(8, 8, 1);
7424                 G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
7425                     src1->asSrcRegRegion()->getModifier(),
7426                     Direct,
7427                     src1->asSrcRegRegion()->getBase(),
7428                     src1->asSrcRegRegion()->getRegOff() + i,
7429                     0,
7430                     rd,
7431                     Type_F);
7432 
7433                 G4_INST* newInst = builder.createMov(g4::SIMD16, dstRgn, srcRgn, InstOpt_NoOpt, false);
7434 
7435                 bb->insertBefore(it, newInst);
7436 
7437                 if (i == 0)
7438                 {
7439                     G4_SrcRegRegion* newSrcRgn = builder.createSrc(
7440                         tmpDcl->getRegVar(),
7441                         0,
7442                         0,
7443                         rd,
7444                         Type_F);
7445 
7446                     inst->setSrc(newSrcRgn, 1);
7447                     inst->transferDef(newInst, Opnd_src1, Opnd_src0);
7448                 }
7449                 newInst->addDefUse(inst, Opnd_src1);
7450             }
7451         }
7452     }
7453     return false;
7454 }
7455 
fixImm64(INST_LIST_ITER i,G4_BB * bb)7456 void HWConformity::fixImm64(INST_LIST_ITER i,
7457     G4_BB* bb)
7458 {
7459     G4_INST* inst = *i;
7460     for (int j = 0, n_srcs = inst->getNumSrc(); j < n_srcs; j++)
7461     {
7462         G4_Operand* src = inst->getSrc(j);
7463         if (!src ||
7464             !(src->isImm()) ||
7465             src->getTypeSize() != 8)
7466         {
7467             continue;
7468         }
7469         // a 64bit immediate is supported ONLY for a MOV operation
7470         bool needsSplit = false;
7471 
7472         if (VISA_WA_CHECK(builder.getPWaTable(), WaDisallow64BitImmMov))
7473         {
7474             needsSplit = true;
7475         }
7476         if (needsSplit)
7477         {
7478             char* immPtr = NULL;
7479             double dfValue = 0.0f;
7480             int64_t qValue = 0;
7481 
7482             if (IS_DFTYPE(src->getType()))
7483             {
7484                 dfValue = src->asImm()->getDouble();
7485                 immPtr = (char*)&dfValue;
7486             }
7487             else
7488             {
7489                 qValue = src->asImm()->getInt();
7490                 immPtr = (char*)&qValue;
7491             }
7492             unsigned int lowValue = *((unsigned int*)(immPtr));
7493             unsigned int highValue = *((unsigned int*)(immPtr + 4));
7494             G4_Imm* lowImm = builder.createImm((int64_t)lowValue, Type_UD);
7495             G4_Imm* highImm = builder.createImm((int64_t)highValue, Type_UD);
7496 
7497             G4_Declare* defDcl = NULL;
7498 
7499             defDcl = builder.createTempVar(1, src->getType(), Eight_Word);
7500             G4_Declare* dcl = builder.createTempVar(2, Type_UD, Eight_Word);
7501             dcl->setAliasDeclare(defDcl, 0);
7502 
7503             G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, 1);
7504             G4_INST* lowMovInst = builder.createMov(g4::SIMD1, dstRegion, lowImm, InstOpt_WriteEnable, false);
7505 
7506             bb->insertBefore(i, lowMovInst);
7507 
7508             auto newDst = builder.createDst(dcl->getRegVar(), 0, 1, 1, dcl->getElemType());
7509             G4_INST* highMovInst = builder.createMov(g4::SIMD1, newDst, highImm, InstOpt_WriteEnable, false);
7510             bb->insertBefore(i, highMovInst);
7511 
7512             inst->transferDef(lowMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
7513             lowMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
7514             inst->transferDef(highMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
7515             highMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
7516 
7517             unsigned short vs = 0, hs = 0, wd = 1; // gen7_5: always 0;1,0
7518             G4_SrcRegRegion* new_src = builder.createSrcRegRegion(defDcl,
7519                 builder.createRegionDesc(vs, wd, hs));
7520             inst->setSrc(new_src, j);
7521         }
7522         else
7523         {
7524             if (inst->opcode() != G4_mov)
7525             {
7526                 inst->setSrc(insertMovBefore(i, j, src->getType(), bb), j);
7527             }
7528         }
7529     }
7530 }
7531 
7532 // Check if the source of def_inst is redefined before inst
checkSrcDefInst(G4_INST * inst,G4_INST * def_inst,uint32_t srcNum)7533 G4_INST* HWConformity::checkSrcDefInst(G4_INST* inst,
7534     G4_INST* def_inst,
7535     uint32_t srcNum)
7536 {
7537     G4_INST* valid_inst = def_inst;
7538 
7539     if (def_inst != NULL)
7540     {
7541         MUST_BE_TRUE(def_inst->opcode() == G4_mov, "def inst must be a mov instruction");
7542 
7543         G4_INST* def_inst1 = NULL;
7544         for (auto def_it1 = inst->def_begin(), end = inst->def_end(); def_it1 != end; def_it1++)
7545         {
7546             if ((*def_it1).second == srcNum + 1)
7547             {
7548                 def_inst1 = (*def_it1).first;
7549             }
7550         }
7551 
7552         if (def_inst1 != NULL)
7553         {
7554             G4_INST* def_inst2 = NULL;
7555             for (auto def_it2 = def_inst->def_begin(), end2 = def_inst->def_end(); def_it2 != end2; def_it2++)
7556             {
7557                 if ((*def_it2).second == Opnd_src0)
7558                 {
7559                     def_inst2 = (*def_it2).first;
7560                 }
7561             }
7562 
7563             if (def_inst1 != def_inst2)
7564             {
7565                 valid_inst = NULL;
7566             }
7567         }
7568     }
7569 
7570     return valid_inst;
7571 }
7572 
7573 /*
7574     Helper function for fixMixedHFInst
7575     It assumes dst is not null and is of type DstRegRegion.
7576     This check must be done before this method is called.
7577 */
helperGenerateTempDst(G4_BB * bb,INST_LIST_ITER instIter,G4_INST * inst,uint8_t hStride,G4_Type tempDstType,G4_SubReg_Align subAlign)7578 void HWConformity::helperGenerateTempDst(
7579     G4_BB* bb,
7580     INST_LIST_ITER instIter,
7581     G4_INST* inst,
7582     uint8_t hStride,
7583     G4_Type tempDstType,
7584     G4_SubReg_Align subAlign)
7585 {
7586     G4_DstRegRegion* dst = inst->getDst();
7587     G4_ExecSize execSize = inst->getExecSize();
7588     uint8_t dstSize = execSize * TypeSize(tempDstType);
7589     //create a new temp with horizontal stride of 1 (packed)
7590     //create a move to dst.
7591 
7592     uint32_t numElt = execSize == 1 ? 1 : execSize * hStride;
7593     if (numElt > 1 && isLowPrecisionFloatTy(tempDstType) && hStride == 1 && subAlign < Eight_Word)
7594         subAlign = Eight_Word;
7595     subAlign = getDclAlignment(dstSize, inst, execSize == 1);
7596 
7597     G4_Declare* dcl = builder.createTempVar(numElt, tempDstType, subAlign);
7598 
7599     G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, hStride);
7600     inst->setDest(dstRegion);
7601 
7602     const RegionDesc* region =
7603         execSize == 1 ?
7604         builder.getRegionScalar() :
7605         builder.createRegionDesc(execSize * hStride, execSize, hStride);
7606     G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(dcl, region);
7607 
7608     //creating a mov from temp dst to final destination using original options of fixed instruction
7609     G4_INST* movInst = builder.createMov(
7610         execSize, dst, srcRegion, inst->getMaskOption(), false);
7611 
7612     ++instIter;
7613     //inserting mov after fixed instruction
7614     bb->insertBefore(instIter, movInst);
7615 
7616     /*
7617     Need to remove dst from uses list of mulh, and add them to movInst useList
7618     add movInst to uselist of mulh.
7619     Add mulh to def instruction list of movInst
7620     */
7621     inst->transferUse(movInst);
7622     inst->addDefUse(movInst, Opnd_src0);
7623 }
7624 
7625 /*
7626     Not Implemented rules:
7627 
7628     3:  (Does this mean align1 doesn't support replication?)
7629         In Align16 mode, replicate is supported and is coissueable.
7630 
7631     4: (handled in reduce execution size)
7632         No simd16 in mixed mode when destination is packed f16 for both Align1 and Align16.
7633 
7634             mad(8) r3.xyzw:hf r4.xyzw:f r6.xyzw:hf r7.xyzw:hf
7635 
7636             add(8) r20.0<1>:hf r3<8;8,1>:f r6.0<8;8,1>:hf {Q1}
7637 
7638     5: (we are not producing this type of code)
7639         No accumulator read access for align16 mixed float
7640 
7641     6: (we do not generate code like this)
        [DevCHV, DevSKL+]: When source is float from accumulator register and destination is half float with a stride of 1, the source must be register aligned, i.e., the source must have offset zero.
7643 
7644     7: (doesn't seem like it is applicable to our code)
7645         In Align16, vertical stride can never be zero for f16
7646 
7647     8.a: (handled by another check)
7648         Math operations for mixed mode,
7649             - In Align16, only packed format is supported
7650 
7651     11. (handled in reduce execution size)
7652         [DevCHV, DevSKL, DevBXT]: No simd16 in mixed mode when destination is f32. Instruction Execution size must be no more than 8.
7653 
7654 */
fixMixedHFInst(G4_BB * bb)7655 void HWConformity::fixMixedHFInst(G4_BB* bb)
7656 {
7657     for (auto instIter = bb->begin(); instIter != bb->end(); ++instIter)
7658     {
7659         G4_INST* inst = *instIter;
7660 
7661         if (inst->mayExceedTwoGRF() || !inst->getDst())
7662         {
7663             continue;
7664         }
7665 
7666         if (VISA_WA_CHECK(builder.getPWaTable(), WaSrc1ImmHfNotAllowed))
7667         {
7668             G4_Operand* tSrc1 = inst->getSrc(1);
7669             if (tSrc1 && tSrc1->isImm() && tSrc1->getType() == Type_HF)
7670             {
7671                 inst->setSrc(insertMovBefore(instIter, 1, Type_HF, bb), 1);
7672             }
7673         }
7674 
7675         if (builder.hasPartialMixMode() && inst->getNumSrc() > 1)
7676         {
7677             bool isPureBF = true;
7678             if (inst->getDst()->getType() != Type_BF)
7679             {
7680                 isPureBF = false;
7681             }
7682             for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
7683             {
7684                 if (inst->getSrc(i)->getType() != Type_BF)
7685                 {
7686                     isPureBF = false;
7687                     break;
7688                 }
7689             }
7690             if (isPureBF)
7691             {
7692                 // pure BF arithmetic instruction is not supported, we make src0 F
7693                 replaceSrc(instIter, 0, Type_F, bb);
7694             }
7695 
7696             // no HF on mad src2 or mul src1
7697             if (inst->isMixedMode())
7698             {
7699                 auto canSwapSource = [](G4_INST* inst)
7700                 {
7701                     int srcPos = inst->opcode() == G4_mad ? 2 : 1;
7702                     G4_Operand* src = inst->getSrc(srcPos);
7703                     G4_Operand* otherSrc = inst->getSrc(srcPos - 1);
7704                     if (src->isImm() || otherSrc->getType() != Type_F)
7705                     {
7706                         // swapping won't work
7707                         return false;
7708                     }
7709                     if (inst->opcode() == G4_mad)
7710                     {
7711                         // src2 has more restrictive regioning, so we can swap only when
7712                         // src1 is scalar or has contiguous region
7713                         if (otherSrc->isSrcRegRegion())
7714                         {
7715                             G4_SrcRegRegion* other = otherSrc->asSrcRegRegion();
7716                             if (other->getRegion()->isScalar() ||
7717                                 other->getRegion()->isContiguous(inst->getExecSize()))
7718                             {
7719                                 return true;
7720                             }
7721                         }
7722                         return false;
7723                     }
7724                     else
7725                     {
7726                         // swapping is always legal for mul
7727                         return true;
7728                     }
7729                 };
7730                 if (inst->opcode() == G4_mad)
7731                 {
7732                     if (isLowPrecisionFloatTy(inst->getSrc(2)->getType()))
7733                     {
7734                         if (canSwapSource(inst))
7735                         {
7736                             inst->swapSrc(1, 2);
7737                         }
7738                         else
7739                         {
7740                             inst->setSrc(insertMovBefore(instIter, 2, Type_F, bb), 2);
7741                         }
7742                     }
7743                     // at this point src2 must be F. Dst must be aligned to
7744                     // same subreg as src2 if src2 is non-scalar
7745                     bool nonScalarSrc2 = inst->getSrc(2)->isSrcRegRegion() &&
7746                         !inst->getSrc(2)->asSrcRegRegion()->getRegion()->isScalar();
7747                     if (nonScalarSrc2)
7748                     {
7749                         if (!builder.isOpndAligned(inst->getDst(), numEltPerGRF<Type_UB>()))
7750                         {
7751                             replaceDst(instIter, Type_F, GRFALIGN);
7752                         }
7753                         if (!builder.isOpndAligned(inst->getSrc(2), numEltPerGRF<Type_UB>()))
7754                         {
7755                             inst->setSrc(insertMovBefore(instIter, 2, inst->getSrc(2)->getType(), bb, GRFALIGN), 2);
7756                         }
7757                     }
7758                 }
7759                 else if (inst->opcode() == G4_mul && isLowPrecisionFloatTy(inst->getSrc(1)->getType()))
7760                 {
7761                     if (canSwapSource(inst))
7762                     {
7763                         inst->swapSrc(0, 1);
7764                     }
7765                     else
7766                     {
7767                         inst->setSrc(insertMovBefore(instIter, 1, Type_F, bb), 1);
7768                     }
7769                 }
7770             }
7771         }
7772 
7773         // The execution size must be no more than 8 when half-floats are used in source or destination operand.
7774         // ToDO: move this to fixmathinst
7775         if (inst->getExecSize() > builder.getNativeExecSize())
7776         {
7777             if (inst->opcode() == G4_math &&
7778                 inst->getDst()->getType() == Type_HF &&
7779                 inst->getSrc(0)->getType() == Type_HF &&
7780                 (!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
7781             {
7782                 evenlySplitInst(instIter, bb);
7783             }
7784         }
7785 
7786         G4_DstRegRegion* dst = inst->getDst();
7787         if (INST_FLOAT_SRC_ONLY(inst->opcode()) && dst && !dst->isNullReg() && isLowPrecisionFloatTy(dst->getType()))
7788         {
7789             helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
7790         }
7791 
7792         if (!inst->isMixedMode())
7793             continue;
7794 
7795         if (inst->getDst() && !inst->getDst()->isNullReg())
7796             dst = inst->getDst();
7797 
7798         if ((VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) ||
7799             VISA_WA_CHECK(builder.getPWaTable(), WaFloatMixedModeSelNotAllowedWithPackedDestination)) &&
7800             inst->opcode() == G4_sel &&
7801             dst &&
7802             (VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) || dst->getHorzStride() == 1) &&
7803             dst->getType() == Type_HF)
7804         {
7805             helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
7806         }
7807 
7808         if (!inst->isMixedMode())
7809             continue;
7810 
7811         if (builder.getPlatform() >= GENX_CHV)
7812         {
7813             // no SIMD16 mix mode instruction
7814             if (inst->getExecSize() > builder.getNativeExecSize() && inst->isMixedMode())
7815             {
7816                 evenlySplitInst(instIter, bb, false);
7817                 //instruction was split, and new instruction inserted before
7818                 //going back to previous instruction to double check it still confirms.
7819                 --instIter;
7820                 inst = *instIter;
7821             }
7822         }
7823 
7824         /*
7825             12: [DevCHV, DevSKL]: Indirect Addressing on source is not supported when source and destination data types are mixed float.
7826         */
7827         if (builder.getPlatform() == GENX_CHV || builder.getPlatform() == GENX_SKL)
7828         {
7829             for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
7830             {
7831                 G4_Operand* src = inst->getSrc(i);
7832                 if (src == nullptr || !src->isSrcRegRegion() || !src->asSrcRegRegion()->isIndirect())
7833                 {
7834                     continue;
7835                 }
7836                 inst->setSrc(insertMovBefore(instIter, i, src->getType(), bb), i);
7837             }
7838         }
7839 
7840         if (inst->getDst()->getBase()->isRegVar() &&
7841             inst->getDst()->getType() == Type_HF &&
7842             inst->getDst()->getHorzStride() == 1)
7843         {
7844             inst->getDst()->getBase()->asRegVar()->getDeclare()->setSubRegAlign(Eight_Word);
7845         }
7846     }
7847 }
7848 
7849 // Fix for packed half types on BDW.
7850 // Conversions from F to packed HF are not supported on this platform,
7851 // only unpacked HF is supported on destination.
7852 // When we encounter an instruction with HF type on destination with <1> stride
7853 // and float on source, add an additional mov that handles unpacking.
fixPackedHFConversions(INST_LIST_ITER it,G4_BB * bb)7854 void HWConformity::fixPackedHFConversions(INST_LIST_ITER it, G4_BB* bb)
7855 {
7856     G4_INST* inst = *it;
7857     G4_DstRegRegion* dst = inst->getDst();
7858     if (dst && dst->getType() == Type_HF && dst->getHorzStride() == 1 &&
7859         TypeSize(inst->getExecType()) > 2)
7860     {
7861         helperGenerateTempDst(bb, it, inst, 2, Type_HF);
7862     }
7863 }
7864 
fixSrc2(INST_LIST_ITER it,G4_BB * bb,bool swapSrc0and2)7865 void HWConformity::fixSrc2(INST_LIST_ITER it, G4_BB* bb, bool swapSrc0and2)
7866 {
7867     G4_INST* inst = *it;
7868     int srcPos = swapSrc0and2 ? 0 : 2; // unfortunate side effect of vISA mad and Gen mad having difference src order
7869     assert(inst->getNumSrc() == 3 && "expect 3-src inst");
7870     if (builder.noSrc2Regioning())
7871     {
7872         auto src = inst->getSrc(srcPos);
7873         // we have to make sure src2 and dst are aligned
7874         // Promote src2's type to f if mix mode is supported.
7875         // e.g.,
7876         // mad (4) r10.0<1>:f src0 src1 r12.0<1>:hf  --> f
7877         // mad (4) r10.0<2>:hf src0 src1 r12.0<1>:hf --> f
7878         // mad (4) r10.0<1>:hf src0 src1 r12.0<2>:hf --> hf
7879         // mad (4) r10.0<2>:hf src0 src1 r12.1<2>:hf --> f
7880         // ditto for 3-src inst with int types
7881         G4_Type srcTy = src->getType();
7882         unsigned short dstEltSz = inst->getDst()->getExecTypeSize();
7883         if (dstEltSz >= 4)
7884         {
7885             if (IS_SIGNED_INT(srcTy))
7886             {
7887                 srcTy = Type_D;
7888             }
7889             else if (IS_UNSIGNED_INT(srcTy))
7890             {
7891                 srcTy = Type_UD;
7892             }
7893             else if (builder.hasMixMode() && builder.getMixModeType() == srcTy)
7894             {
7895                 // we can change operand type to F to save one move
7896                 srcTy = Type_F;
7897             }
7898         }
7899         inst->setSrc(insertMovBefore(it, srcPos, srcTy, bb, GRFALIGN), srcPos);
7900 
7901         // Check if dst stride aligns with src2.
7902         if (dstEltSz != TypeSize(srcTy))
7903         {
7904             replaceDst(it, inst->getDst()->getType(), GRFALIGN);
7905         }
7906     }
7907 }
7908 
fixVxHFloat64b(INST_LIST_ITER it,G4_BB * bb)7909 void HWConformity::fixVxHFloat64b(INST_LIST_ITER it, G4_BB* bb)
7910 {
7911     // at this point VxH region should only be on src0
7912     G4_INST* inst = *it;
7913     G4_SrcRegRegion* src0 = inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion() ?
7914         inst->getSrc(0)->asSrcRegRegion() : nullptr;
7915 
7916     if (src0 && src0->getRegAccess() == IndirGRF && src0->getRegion()->isRegionWH())
7917     {
7918         auto type = src0->getType();
7919         // additionally check for int->float type conversion
7920         // FIXME: replace with SWSB's pipe check functions
7921         bool isFloatPipe = type == Type_HF || type == Type_F;
7922         if (inst->opcode() == G4_mov)
7923         {
7924             isFloatPipe |= TypeSize(type) < 8 && (inst->getDst()->getType() == Type_HF || inst->getDst()->getType() == Type_F);
7925         }
7926         if (isFloatPipe)
7927         {
7928             auto intType = TypeSize(type) == 4 ? Type_UD : Type_UW;
7929             if (inst->isRawMov())
7930             {
7931                 // directly change the dst/src type to int
7932                 inst->getDst()->setType(intType);
7933                 src0->setType(intType);
7934             }
7935             else
7936             {
7937                 // generate a copy move using int type
7938                 // FIXME: code is a bit hacky, may want to change insertMovBefore
7939                 // so that we could specify the move type
7940                 auto origType = src0->getType();
7941                 auto origMod = src0->getModifier();
7942                 src0->setType(intType);
7943                 src0->setModifier(Mod_src_undef);
7944                 auto newSrc = insertMovBefore(it, 0, intType, bb);
7945                 newSrc->asSrcRegRegion()->setType(origType);
7946                 newSrc->asSrcRegRegion()->setModifier(origMod);
7947                 inst->setSrc(newSrc, 0);
7948             }
7949         }
7950         else if (TypeSize(type) == 8)
7951         {
7952             int numDwords = inst->getExecSize() * 2;
7953             G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src0->getType(), Any);
7954             const RegionDesc* newRegion = builder.getRegionStride1();
7955             copyDwordsIndirect(tmpSrc, src0, numDwords, bb, it);
7956             G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(src0->getModifier(),
7957                 Direct, tmpSrc->getRegVar(), 0, 0, newRegion, tmpSrc->getElemType());
7958             inst->setSrc(tmpSrcOpnd, 0);
7959         }
7960     }
7961 }
7962 
fixIntToHFMove(G4_BB * bb)7963 bool HWConformity::fixIntToHFMove(G4_BB* bb)
7964 {
7965     // int to HF move requires dst to have stride 2, which would result in
7966     // an illegal SIMD32 inst. So we split in this case
7967     // we put it in a separate pass so that the split instructions may be legalized later
7968     bool changed = false;
7969     for (auto I = bb->begin(), E = bb->end(); I != E; ++I)
7970     {
7971         auto inst = *I;
7972         if (inst->opcode() == G4_mov && inst->getDst()->getType() == Type_HF &&
7973             IS_INT(inst->getSrc(0)->getType()))
7974         {
7975             if (inst->getExecSize() * 2 * 2 > getGRFSize() * 2)
7976             {
7977                 evenlySplitInst(I, bb);
7978                 changed = true;
7979             }
7980         }
7981     }
7982     return changed;
7983 }
7984 
fixPredCtrl(INST_LIST_ITER it,G4_BB * bb)7985 void HWConformity::fixPredCtrl(INST_LIST_ITER it, G4_BB* bb)
7986 {
7987     G4_INST* inst = *it;
7988     G4_Predicate* pred = inst->getPredicate();
7989     if (pred && (pred->getControl() == PRED_ANY_WHOLE || pred->getControl() == PRED_ALL_WHOLE))
7990     {
7991         // we need WA if pred's size is greater than inst's exec size
7992         // and the platform does not support predctrl group size (indicated by the fact we
7993         // have PRED_ANY_WHOLE and PRED_ALL_WHOLE)
7994         // The case where pred size is less than inst's exec size is already undefined
7995         // even with predCtrl group size..
7996         G4_Declare* flagDcl = pred->getTopDcl();
7997         if (flagDcl->getNumberFlagElements() > inst->getExecSize())
7998         {
7999             // convert
8000             // (f0.any32h) sel (1) ...
8001             // into
8002             // cmp (1) [ne] f1 f0 0
8003             // (f1) sel (1) ...
8004             // and
8005             // (f0.all32h) sel (1) ...
8006             // into
8007             // cmp (1) [e] f1 f0 0xFFFFFFFF
8008             //
8009             // if f0 happens to be < 16 elements we have to clear upper bits as well in case it has garbage values
8010             assert(!inst->getCondMod() && "currently don't handle an instruction with conditional modifier");
8011             assert((inst->isWriteEnableInst() || bb->isAllLaneActive()) && "don't handle instruction in SIMD CF for now");
8012             G4_Declare* tmpFlag = builder.createTempFlag(1);
8013             G4_Type flagType = flagDcl->getNumberFlagElements() == 32 ? Type_UD : Type_UW;
8014             uint32_t allOneMask = (uint32_t)((1ULL << flagDcl->getNumberFlagElements()) - 1);
8015             G4_Declare* cmpSrc0Flag = flagDcl;
8016             if (flagDcl->getNumberFlagElements() < 16)
8017             {
8018                 // clear the upper bit of the flag
8019                 auto andInst = builder.createBinOp(G4_and, g4::SIMD1, builder.createDstRegRegion(tmpFlag, 1),
8020                     builder.createSrcRegRegion(flagDcl, builder.getRegionScalar()),
8021                     builder.createImm(allOneMask, Type_UW), InstOpt_WriteEnable, false);
8022                 bb->insertBefore(it, andInst);
8023                 cmpSrc0Flag = tmpFlag;
8024             }
8025             G4_CondMod* condMod = builder.createCondMod(pred->getControl() == PRED_ANY_WHOLE ? Mod_ne : Mod_e,
8026                 tmpFlag->getRegVar(), 0);
8027 
8028             G4_Imm* immVal = builder.createImm(pred->getControl() == PRED_ANY_WHOLE ? 0 : allOneMask, flagType);
8029             // cmp needs to be as wide as the original inst but is uniform and NoMask otherwise
8030             auto cmpInst = builder.createInternalInst(
8031                 nullptr, G4_cmp, condMod, g4::NOSAT, inst->getExecSize(),
8032                 builder.createNullDst(flagType),
8033                 builder.createSrc(cmpSrc0Flag->getRegVar(), 0, 0, builder.getRegionScalar(), flagType),
8034                 immVal,
8035                 InstOpt_WriteEnable);
8036             bb->insertBefore(it, cmpInst);
8037             inst->setPredicate(builder.createPredicate(pred->getState(), tmpFlag->getRegVar(), 0));
8038         }
8039     }
8040 }
8041 
8042 // emulate mov F BF
8043 // with
8044 // shl UD UW 16
fixBFMove(INST_LIST_ITER i,G4_BB * bb)8045 bool HWConformity::fixBFMove(INST_LIST_ITER i, G4_BB* bb)
8046 {
8047     G4_INST* inst = *i;
8048     if (inst->opcode() != G4_mov)
8049     {
8050         return false;
8051     }
8052     G4_Operand* src0 = inst->getSrc(0);
8053 
8054     if (inst->getDst()->getType() == Type_BF)
8055     {
8056         // allow BF->BF moves as they may be introduced during HW conformity
8057         // we will change their type to HF later
8058         assert((src0->getType() == Type_F || src0->getType() == Type_BF) &&
8059             "Only F->BF conversion is supported");
8060         assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8061             "F->BF move does not support pred/cond mod/sat");
8062         if (src0->isSrcRegRegion())
8063         {
8064             assert(src0->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8065                 "F->BF move does not support source modifier");
8066         }
8067         if (src0->getType() == Type_BF)
8068         {
8069             // change type of copy move to uw
8070             inst->getDst()->setType(Type_UW);
8071             src0->asSrcRegRegion()->setType(Type_UW);
8072         }
8073         return false;
8074     }
8075 
8076     if (src0->getType() == Type_BF)
8077     {
8078         assert(inst->getDst()->getType() == Type_F && "Only BF->F conversion is supported");
8079         assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8080             "BF->F move does not support pred/cond mod/sat");
8081         // don't support BF imm for now
8082         assert(src0->isSrcRegRegion() &&
8083             src0->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8084             "F->BF move does not support source modifier");
8085 
8086         auto src0RR = src0->asSrcRegRegion();
8087 
8088         src0RR->setType(Type_UW);
8089         G4_SrcRegRegion* newSrc0 = src0RR;
8090 
8091         inst->getDst()->setType(Type_UD);
8092         auto newDst = inst->getDst();
8093 
8094         auto shlInst = builder.createBinOp(G4_shl,
8095             inst->getExecSize(), newDst, newSrc0, builder.createImm(16, Type_UW), inst->getOption(), false);
8096         bb->insertBefore(i, shlInst);
8097         bb->erase(i);
8098 
8099         return true;
8100     }
8101 
8102     return false;
8103 }
8104 
isFloatOr64b(G4_INST * inst)8105 bool HWConformity::isFloatOr64b(G4_INST* inst)
8106 {
8107     auto dst = inst->getDst();
8108     auto dstTy = dst->getType();
8109 
8110     bool goFloatPipe = IS_TYPE_FLOAT_ALL(dstTy) || TypeSize(dstTy) >= 8;
8111 
8112     if (!goFloatPipe)
8113     {
8114         for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8115         {
8116             auto src = inst->getSrc(i);
8117             if (src)
8118             {
8119                 bool nonScalarFloat = IS_TYPE_FLOAT_ALL(src->getType()) &&
8120                     src->isSrcRegRegion() && !src->asSrcRegRegion()->isScalar();
8121                 // Q type may be mixed with other int (e.g., D = Q + D), so always needs checking as we may need
8122                 // to fix the other operands.
8123                 // float type only needs checking if it's non-scalar
8124                 // ToDo: consider skipping all mixed mode as they should already confirm to region rules
8125                 if (IS_QTYPE(src->getType()) || nonScalarFloat)
8126                 {
8127                     goFloatPipe = true;
8128                     break;
8129                 }
8130             }
8131         }
8132     }
8133     return goFloatPipe;
8134 }
8135 
getSrcStride(G4_SrcRegRegion * src)8136 uint16_t HWConformity::getSrcStride(G4_SrcRegRegion* src)
8137 {
8138     uint16_t srcStride = 0;
8139     src->getRegion()->isSingleStride(src->getInst()->getExecSize(), srcStride);
8140     srcStride *= src->getTypeSize();
8141     return srcStride;
8142 };
8143 
change64bStride2CopyToUD(INST_LIST_ITER it,G4_BB * bb)8144 void HWConformity::change64bStride2CopyToUD(INST_LIST_ITER it, G4_BB* bb)
8145 {
8146     G4_INST* inst = *it;
8147     G4_Operand* src = inst->getSrc(0);
8148     MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
8149     G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
8150     G4_Type execType = inst->getDst()->getType();
8151     uint16_t stride = inst->getDst()->getHorzStride();
8152     short dstRegOff = inst->getDst()->getRegOff();
8153     short dstSubRegOff = inst->getDst()->getSubRegOff();
8154 
8155     assert((execType == Type_Q || execType == Type_DF) && "Only 64b data type support");
8156     execType = Type_UD;
8157     dstSubRegOff *= 2;
8158 
8159     G4_DstRegRegion* newDst = builder.createDst(
8160         inst->getDst()->getBase(),
8161         dstRegOff,
8162         dstSubRegOff + 1,
8163         stride * 2,
8164         execType);
8165     G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(origSrc->getModifier(), Direct, origSrc->getBase(),
8166         origSrc->getRegOff(), origSrc->getSubRegOff() * 2 + 1, builder.createRegionDesc(2, 1, 0), Type_UD);
8167     inst->setSrc(newSrc, 0);
8168     inst->setDest(newDst);
8169 
8170     G4_DstRegRegion* newDst1 = builder.createDst(
8171         inst->getDst()->getBase(),
8172         dstRegOff,
8173         dstSubRegOff,
8174         stride * 2,
8175         execType);
8176     G4_SrcRegRegion* newSrc1 = builder.createSrcRegRegion(origSrc->getModifier(), Direct, origSrc->getBase(),
8177         origSrc->getRegOff(), origSrc->getSubRegOff() * 2, builder.createRegionDesc(2, 1, 0), Type_UD);
8178 
8179     G4_INST* movInst = builder.createMov(inst->getExecSize(), newDst1, newSrc1, inst->getOption(), false);
8180 
8181     INST_LIST_ITER iter = it;
8182     iter++;
8183     bb->insertBefore(it, movInst);
8184 }
8185 
// on XeHP_SDV we have to make sure each source element is aligned to each dst element
8187 // for all float/64b inst (packed HF is ok in mixed mode inst)
8188 // For all violating instructions, we align each operand to the execution type
8189 // for float copy moves we could directly convert their type to int
fixUnalignedRegions(INST_LIST_ITER it,G4_BB * bb)8190 void HWConformity::fixUnalignedRegions(INST_LIST_ITER it, G4_BB* bb)
8191 {
8192     G4_INST* inst = *it;
8193     if (!inst->getDst() || inst->isSend() || inst->isDpas() ||
8194         hasDedicateAlignRegionConformity(it) ||
8195         inst->getExecSize() == g4::SIMD1)
8196     {
8197         // only check non-scalar ALU instructions
8198         return;
8199     }
8200 
8201     if (!isFloatOr64b(inst))
8202     {
8203         return;
8204     }
8205     auto dst = inst->getDst();
8206     auto dstTy = dst->getType();
8207     G4_Type execTy = inst->getExecType();
8208     if (TypeSize(dstTy) > TypeSize(execTy))
8209     {
8210         // getExecType() does not take dst ty into account, while we have to consider the widest type
8211         // in all operands here
8212         execTy = dstTy;
8213     }
8214     auto execTyWidth = TypeSize(execTy);
8215 
8216     // input must be a 64b copy move with packed dst and singly-strided src
8217     // this works for both direct and indirect dst and src
8218     auto change64bCopyToUD = [this](G4_INST* movInst, uint16_t srcStride)
8219     {
8220         auto oldSrc = movInst->getSrc(0)->asSrcRegRegion();
8221         G4_SrcRegRegion* movSrc = nullptr;
8222         if (oldSrc->getRegAccess() == Direct)
8223         {
8224             // change region, type, and subreg offset
8225             movSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, oldSrc->getBase(),
8226                 oldSrc->getRegOff(), oldSrc->getSubRegOff() * 2, builder.createRegionDesc(srcStride * 2, 2, 1), Type_UD);
8227         }
8228         else
8229         {
8230             // change region and type
8231             movSrc = builder.createIndirectSrc(oldSrc->getModifier(), oldSrc->getBase(), oldSrc->getRegOff(),
8232                 oldSrc->getSubRegOff(), builder.createRegionDesc(srcStride * 2, 2, 1), Type_UD, oldSrc->getAddrImm());
8233         }
8234         movInst->setSrc(movSrc, 0);
8235 
8236         auto oldDst = movInst->getDst();
8237         G4_DstRegRegion* movDst = nullptr;
8238         if (oldDst->getRegAccess() == Direct)
8239         {
8240             movDst = builder.createDst(oldDst->getBase(), oldDst->getRegOff(), oldDst->getSubRegOff() * 2, oldDst->getHorzStride(), Type_UD, oldDst->getAccRegSel());
8241         }
8242         else
8243         {
8244             movDst = builder.createIndirectDst(oldDst->getBase(), oldDst->getSubRegOff(), oldDst->getHorzStride(), Type_UD, oldDst->getAddrImm());
8245         }
8246         movInst->setDest(movDst);
8247         movInst->setExecSize(G4_ExecSize(movInst->getExecSize() * 2u));
8248         movInst->setOptionOn(InstOpt_WriteEnable);
8249         // caller guarantees movInst is not predicated, so we can reset its mask offset to 0
8250         // this is to avoid a bug where changing
8251         // mov (8|M24) r2.0<1>:q
8252         // -->
8253         // mov (16|M24) r2.0<1>:ud
8254         // would result in illegal mask offset for SIMD16
8255         movInst->setMaskOption(InstOpt_M0);
8256     };
8257 
8258     if (inst->isRawMov())
8259     {
8260         // we can do better for float/64b copy moves by directly changing their type
8261         bool done = true;
8262         if (inst->getSrc(0)->isSrcRegRegion() && !inst->getSrc(0)->asSrcRegRegion()->isScalar())
8263         {
8264             auto src0RR = inst->getSrc(0)->asSrcRegRegion();
8265             int dstStride = TypeSize(dstTy) * inst->getDst()->getHorzStride();
8266             int srcStride = getSrcStride(src0RR);
8267             if (dstStride != srcStride || !builder.isOpndAligned(inst->getSrc(0), getGRFSize()) ||
8268                 !builder.isOpndAligned(inst->getDst(), getGRFSize()))
8269             {
8270                 bool isNoMaskInst = !inst->getPredicate() && (inst->isWriteEnableInst() || bb->isAllLaneActive());
8271                 if (execTyWidth < 8)
8272                 {
8273                     auto intType = TypeSize(dstTy) == 4 ? Type_UD : Type_UW;
8274                     inst->getDst()->setType(intType);
8275                     src0RR->setType(intType);
8276                 }
8277                 else if (isNoMaskInst && inst->getDst()->getHorzStride() == 1 && srcStride != 0)
8278                 {
8279                     // for packed 64b copy moves that are not under divergent CF, we can
8280                     // change its type to UD
8281                     change64bCopyToUD(inst, srcStride / inst->getSrc(0)->getTypeSize());
8282                 }
8283                 else if (isNoMaskInst && inst->getDst()->getHorzStride() == 2 && execTyWidth == 8 &&
8284                     src0RR->getRegion()->isContiguous(inst->getExecSize()))
8285                 {
8286                     change64bStride2CopyToUD(it, bb);
8287                 }
8288                 else if (execTyWidth == 8 && IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0RR->getType()) && srcStride != 0 && !src0RR->isIndirect())
8289                 {
8290                     // we can split 64b moves with single source stride into 2UD moves
8291                     // ToDo: check if this subsumes the previous else if
8292                     emulate64bMov(it, bb);
8293                 }
8294                 else
8295                 {
8296                     // a move we don't know how to handle without inserting more moves
8297                     done = false;
8298                 }
8299             }
8300         }
8301         if (done)
8302         {
8303             // the move is ok at this point
8304             return;
8305         }
8306     }
8307 
8308     // some operands may have fixed offset (e.g., input), and we can directly check if all operands have the same sub-reg
8309     // for simplicity we require all operands to have same type and are packed.
8310     {
8311         bool goodOperand = true;
8312         if (inst->getDst()->getHorzStride() != 1)
8313         {
8314             goodOperand = false;
8315         }
8316         for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8317         {
8318             if (inst->getSrc(i)->isSrcRegRegion())
8319             {
8320                 auto srcRR = inst->getSrc(i)->asSrcRegRegion();
8321                 if (srcRR->getType() != inst->getDst()->getType() ||
8322                     (!srcRR->isScalar() && !srcRR->getRegion()->isContiguous(inst->getExecSize())))
8323                 {
8324                     goodOperand = false;
8325                     break;
8326                 }
8327             }
8328         }
8329         uint32_t commonOffset = 0;
8330         if (goodOperand && hasSameSubregOffset(inst, commonOffset) && commonOffset != 0)
8331         {
8332             //for some strange reason HW requires null operands to have the same subreg offset as other operands as well
8333             if (inst->getDst()->isNullReg())
8334             {
8335                 inst->setDest(builder.createDst(builder.phyregpool.getNullReg(), 0, commonOffset / dst->getTypeSize(), 1, dst->getType()));
8336             }
8337             return;
8338         }
8339     }
8340 
8341     if (inst->getExecSize() == g4::SIMD2 && inst->getNumSrc() != 3)
8342     {
8343         if (inst->getDst()->getAccRegSel() != ACC_UNDEFINED)
8344         {
8345             // this instruction is internally generated, no need to check
8346             return;
8347         }
8348 
8349         // split currently can't handle packed imm
8350         // Also don't split src byte type since scalar byte to float conversion is not allowed
8351         auto canSplit = [](G4_INST* inst)
8352         {
8353             if (inst->getPredicate() || inst->getCondMod())
8354             {
8355                 return false;
8356             }
8357             for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8358             {
8359                 auto ty = inst->getSrc(i)->getType();
8360                 if (IS_VINTTYPE(ty) || ty == Type_VF || IS_BTYPE(ty))
8361                 {
8362                     return false;
8363                 }
8364             }
8365             return true;
8366         };
8367         if (canSplit(inst))
8368         {
8369             auto prevIt = it == bb->begin() ? it : std::prev(it);
8370             if (evenlySplitInst(it, bb))
8371             {
8372                 // split introduces new moves which may need fixing
8373                 // after splitting it points to the second instruction
8374                 INST_LIST_ITER splitIt = std::prev(it);
8375                 INST_LIST_ITER insertIt = prevIt == bb->begin() ? prevIt : std::next(prevIt);
8376                 while (insertIt != splitIt)
8377                 {
8378                     fixUnalignedRegions(insertIt, bb);
8379                     insertIt++;
8380                 }
8381             }
8382             return;
8383         }
8384     }
8385 
8386     // fix Dst if necessary
8387     // some special mix mode dst are allowed provided the instruction has F type:
8388     // r1.0<2>:bf
8389     // r1.1<2>:bf
8390     // r1.0<1>:bf
8391     // r1.8<1>:bf
8392     bool isSpecialMixModeDst = false;
8393     bool canDoPackedFtoHFMove = builder.hasFtoPackedHFMove() && inst->opcode() == G4_mov && inst->getExecSize() >= builder.getNativeExecSize() &&
8394         dstTy == Type_HF && !dst->isIndirect();
8395     if ((builder.getMixModeType() == dstTy || canDoPackedFtoHFMove) && IS_FTYPE(execTy))
8396     {
8397         uint16_t offset = 0;
8398         bool isAligned = builder.isOpndAligned(dst, offset, getGRFSize() / 2);
8399         if (dst->getHorzStride() == 1)
8400         {
8401             isSpecialMixModeDst = isAligned;
8402         }
8403         else if (dst->getHorzStride() == 2)
8404         {
8405             isSpecialMixModeDst = isAligned || (offset % 32) == 2;
8406         }
8407     }
8408 
8409     if (canDoPackedFtoHFMove && isSpecialMixModeDst)
8410     {
8411         if (inst->getExecSize() > builder.getNativeExecSize())
8412         {
8413             evenlySplitInst(it, bb);
8414         }
8415         return;
8416     }
8417 
8418     auto dstStride = TypeSize(dstTy) * dst->getHorzStride();
8419     uint16_t dstAlign = inst->getSrc(0)->getType() == Type_VF ? 16 : getGRFSize();
8420     if (dst->getRegAccess() == Direct && !isSpecialMixModeDst &&
8421         (!builder.isOpndAligned(dst, dstAlign) || dstStride != execTyWidth))
8422     {
8423         inst->setDest(insertMovAfter(it, dst, dst->getType(), bb, GRFALIGN));
8424         if (IS_TYPE_FLOAT_ALL(dst->getType()) || dst->getTypeSize() == 8)
8425         {
8426             // the move may need more fixing
8427             fixUnalignedRegions(std::next(it), bb);
8428         }
8429     }
8430     else if (dst->getRegAccess() == IndirGRF && dst->getType() == Type_F)
8431     {
8432         // Since we can't know if an indirect dst is aligned or not,
8433         // The proper fix is to insert a move then change its type to int.
8434         // FIXME: not sure how to handle fp64 yet
8435         inst->setDest(insertMovAfter(it, dst, dst->getType(), bb, GRFALIGN));
8436         // the move may need more fixing
8437         fixUnalignedRegions(std::next(it), bb);
8438     }
8439 
8440     auto getUnsignedType = [](int numByte)
8441     {
8442         switch (numByte)
8443         {
8444         case 1:
8445             return Type_UB;
8446         case 2:
8447             return Type_UW;
8448         case 4:
8449             return Type_UD;
8450         case 8:
8451             return Type_UQ;
8452         default:
8453             assert(false && "illegal type width");
8454             return Type_UD;
8455         }
8456     };
8457 
8458     // generate a move where each element is aligned to execTyWidth
8459     // e.g.,
8460     // mov (8) V1<1>:q V2<1;1,0>:ud
8461     // becomes
8462     // mov (8) tmp<2>:ud V2<1;1,0>:ud
8463     // mov (8) V1<1>:q tmp<2;1,0>:ud
8464     // or
8465     // add (8) V1<1>:f V2<2;1,0>:f V3<1;1,0>:f
8466     // becomes
8467     // mov (8) tmp<1>:ud V2<2;1,0>:ud
8468     // add (8) V1<1>:f tmp<1;1,0>:f V3<1;1,0>:f
8469     // note that for float types we have to do the move in int since the move may be illegal otherwise
8470     auto doAlignMove = [&](G4_INST* inst, int srcPos, int stride)
8471     {
8472         // caller must ensure src is a srcregregion
8473         bool movNeedsFix = false;
8474         auto src = inst->getSrc(srcPos)->asSrcRegRegion();
8475         auto srcTy = src->getType();
8476         auto tmpTy = getUnsignedType((int)TypeSize(srcTy));
8477         auto movSrcTy = tmpTy;
8478         auto newSrcTy = srcTy;
8479         if (stride == 8 || (tmpTy == Type_UB &&
8480             builder.getNativeExecSize() > g4::SIMD8 &&
8481             (stride == 2 || stride == 4)))
8482         {
8483             // use UW as the tmp, and divide the stride by 2
8484             // there are two reasons for this transform,
8485             // 1) stride 8 is not supported
8486             // 2) avoid read-modify-write on bytes
8487             // mov (4) V1<4>:uw V2:ub
8488             // then use <4;1,0>:uw in the original inst
8489             tmpTy = (srcTy == Type_UB) ? Type_UW : Type_W;
8490             movSrcTy = srcTy;
8491             stride = stride / 2;
8492             newSrcTy = tmpTy;
8493         }
8494         auto tmp = builder.createTempVar(inst->getExecSize() * stride, tmpTy, GRFALIGN);
8495         auto movSrc = builder.createSrcRegRegion(*src);
8496         movSrc->setModifier(Mod_src_undef);
8497         movSrc->setType(movSrcTy);
8498         auto movInst = builder.createMov(inst->getExecSize(),
8499             builder.createDstRegRegion(tmp, stride), movSrc, inst->getOption(), false);
8500         if (movSrc->getTypeSize() == 8)
8501         {
8502             assert(stride == 1 && "expect dst stride to be 1 here");
8503             // the move instruction is itself illegal due to the source region being non-contiguous/not GRF-aligned
8504             // if the region is singly-strided, we can change it into a UD move, e.g.,
8505             // mov (8) V1<1>:q V2<2;1,0>:q
8506             // becomes
8507             // (W) mov (16) V1<1>:ud V2<4;2,1>:ud
8508             uint16_t srcStride = 0;
8509             if (movSrc->getRegion()->isSingleStride(inst->getExecSize(), srcStride))
8510             {
8511                 change64bCopyToUD(movInst, srcStride);
8512             }
8513             else
8514             {
8515                 movNeedsFix = true;
8516             }
8517         }
8518         bb->insertBefore(it, movInst);
8519         if (movNeedsFix)
8520         {
8521             // try splitting the move as last resort
8522             // it may be successful if we are not in SIMD CF
8523             evenlySplitInst(std::prev(it), bb);
8524         }
8525         auto newSrc = builder.createSrcRegRegion(src->getModifier(), Direct, tmp->getRegVar(), 0, 0,
8526             builder.createRegionDesc(stride, 1, 0), newSrcTy);
8527         inst->setSrc(newSrc, srcPos);
8528     };
8529 
8530     for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8531     {
8532         G4_SrcRegRegion* src = inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() ?
8533             inst->getSrc(i)->asSrcRegRegion() : nullptr;
8534         if (src)
8535         {
8536             if (IS_BTYPE(src->getType()) && (src->getRegion()->isRegionWH() || src->getRegion()->isScalar()))
8537             {
8538                 // no scalar byte when dst is float
8539                 // byte src with DF dst is handled by fixMov
8540                 inst->setSrc(insertMovBefore(it, 0, inst->getDst()->getTypeSize() == 4 ? Type_D : Type_W, bb), 0);
8541             }
8542             else if (!src->getRegion()->isRegionWH() && !src->getRegion()->isScalar())
8543             {
8544                 // indirect VxH operands are handled elsewhere
8545                 auto srcStride = getSrcStride(src);
8546                 bool isMixModeSrc = isLowPrecisionFloatTy(src->getType()) && IS_FTYPE(execTy);
8547                 bool isMixModePackedSrc = isMixModeSrc && srcStride == 2;
8548                 uint16_t alignment = isMixModePackedSrc ? (getGRFSize() / 2) : getGRFSize();
                // for mix mode the source must be packed, otherwise srcStride should be == sizeof(exec type)
8550                 if (!builder.isOpndAligned(src, alignment) || (isMixModeSrc ? !isMixModePackedSrc : srcStride != execTyWidth))
8551                 {
8552                     int stride = (int)(isMixModeSrc ? 1 : execTyWidth / src->getTypeSize());
8553                     doAlignMove(inst, i, stride);
8554                 }
8555             }
8556         }
8557     }
8558 }
8559 
8560 // emulate mov HF BF8
8561 // with
8562 // shl UW UB 8
fixFcvt(INST_LIST_ITER i,G4_BB * bb)8563 bool HWConformity::fixFcvt(INST_LIST_ITER i, G4_BB* bb)
8564 {
8565     G4_INST* inst = *i;
8566     if (inst->opcode() != G4_fcvt)
8567     {
8568         return false;
8569     }
8570 
8571     if (inst->getDst()->getType() == Type_UB)
8572     {
8573         assert((inst->getSrc(0)->getType() == Type_HF) &&
8574             "Only HF->BF8 conversion is supported");
8575         assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8576             "HF->BF8 move does not support pred/cond mod/sat");
8577         assert(inst->getSrc(0)->isSrcRegRegion() &&
8578             "HF->BF8 currently supports non-imm source only");
8579         assert(inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == Direct &&
8580             inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8581             "HF->BF8 move does not support source modifier");
8582 
8583         // fix regioning <0;1,0> to <1;1,0> for execution sizes higher than 1.
8584         if (inst->getSrc(0)->asSrcRegRegion()->getRegion()->isScalar() &&
8585             inst->getExecSize() != g4::SIMD1)
8586         {
8587             inst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
8588             inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
8589             INST_LIST_ITER newMovIter = i;
8590             newMovIter--;
8591             G4_INST* newMovInst = *newMovIter;
8592             newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionScalar());
8593         }
8594         assert(inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) &&
8595             "HF->BF8 only support <1;1,0> regioning");
8596         if (inst->getDst()->getHorzStride() != 1)
8597         {
8598             replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
8599             INST_LIST_ITER newMovIter = i;
8600             newMovIter++;
8601             G4_INST* newMovInst = *newMovIter;
8602             newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
8603             newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
8604             if (inst->getExecSize() != g4::SIMD1)
8605             {
8606                 newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
8607             }
8608             inst->getDst()->setHorzStride(1);
8609             inst->setOptionOn(InstOpt_WriteEnable);
8610         }
8611         if (!builder.isOpndAligned(inst->getDst(), 64) ||
8612             !inst->isWriteEnableInst())
8613         {
8614             replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
8615             INST_LIST_ITER newMovIter = i;
8616             newMovIter++;
8617             G4_INST* newMovInst = *newMovIter;
8618             newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
8619             newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
8620             inst->setOptionOn(InstOpt_WriteEnable);
8621         }
8622         if (!builder.isOpndAligned(inst->getSrc(0), 64))
8623         {
8624             inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
8625         }
8626         return true;
8627     }
8628 
8629     if (inst->getSrc(0)->getType() == Type_UB)
8630     {
8631         assert((inst->getDst()->getType() == Type_HF) &&
8632             "Only BF8->HF conversion is supported");
8633         assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8634             "BF8->HF move does not support pred/cond mod/sat");
8635         // don't support QF imm for now
8636         assert(inst->getSrc(0)->isSrcRegRegion() && inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == Direct &&
8637             inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8638             "BF8->HF move does not support source modifier");
8639 
8640         // fix regioning <0;1,0> to <1;1,0> for execution sizes higher than 1.
8641         if (inst->getSrc(0)->asSrcRegRegion()->getRegion()->isScalar() &&
8642             inst->getExecSize() != g4::SIMD1)
8643         {
8644             inst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
8645             inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
8646             INST_LIST_ITER newMovIter = i;
8647             newMovIter--;
8648             G4_INST* newMovInst = *newMovIter;
8649             newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
8650             newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
8651             newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionScalar());
8652         }
8653         assert(inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) &&
8654             "BF8->HF only support <1;1,0> regioning");
8655         if (inst->getDst()->getHorzStride() != 1)
8656         {
8657             if (inst->getExecSize() != g4::SIMD1)
8658             {
8659                 replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
8660                 INST_LIST_ITER newMovIter = i;
8661                 newMovIter++;
8662                 G4_INST* newMovInst = *newMovIter;
8663                 newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
8664             }
8665             inst->getDst()->setHorzStride(1);
8666         }
8667         if (!builder.isOpndAligned(inst->getDst(), 64))
8668         {
8669             replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
8670         }
8671         if (!builder.isOpndAligned(inst->getSrc(0), 64))
8672         {
8673             inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
8674             INST_LIST_ITER newMovIter = i;
8675             newMovIter--;
8676             G4_INST* newMovInst = *newMovIter;
8677             newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
8678             newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
8679         }
8680 
8681         inst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
8682         G4_SrcRegRegion* newSrc0 = inst->getSrc(0)->asSrcRegRegion();
8683 
8684         inst->getDst()->setType(Type_UW);
8685         auto newDst = inst->getDst();
8686 
8687         auto shlInst = builder.createBinOp(G4_shl,
8688             inst->getExecSize(), newDst, newSrc0, builder.createImm(8, Type_UW), inst->getOption(), false);
8689         bb->insertBefore(i, shlInst);
8690         bb->erase(i);
8691 
8692         return true;
8693     }
8694     if (inst->getSrc(0)->getType() == Type_UD)
8695     {
8696         // fcvt  a:F   b:tf32
8697         // --> mov  a:f  b:f  (tf32 format is valid f)
8698         G4_Operand* newSrc;
8699         if (inst->getSrc(0)->isImm())
8700         {
8701             float  newF = inst->getSrc(0)->asImm()->getFloat();
8702             newSrc = builder.createImm(newF);
8703         }
8704         else
8705         {
8706             G4_SrcRegRegion* regSrc = inst->getSrc(0)->asSrcRegRegion();
8707             regSrc->setType(Type_F);
8708             newSrc = regSrc;
8709         }
8710         auto newDst = inst->getDst();
8711         auto movInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
8712         bb->insertBefore(i, movInst);
8713         bb->erase(i);
8714         return true;
8715     }
8716 
8717     if (inst->getDst()->getType() == Type_UD)
8718     {
8719         // fcvt a:tf32   b:f
8720         // Make sure dst/src0 have the same subreg offset and stride, except for scalar broadcast.
8721         G4_Operand* src0 = inst->getSrc(0);
8722         if (src0->isSrcRegRegion() && !src0->asSrcRegRegion()->getRegion()->isScalar())
8723         {
8724             G4_SrcRegRegion* regSrc0 = inst->getSrc(0)->asSrcRegRegion();
8725             G4_DstRegRegion* regDst = inst->getDst();
8726             uint16_t srcSingleStride;
8727             // Note that regSrc0 must not be scalar here!
8728             if (!regSrc0->getRegion()->isSingleStride(inst->getExecSize(), srcSingleStride))
8729             {
8730                 // set it to an invalid value as it has no single (uniform) stride
8731                 srcSingleStride = 0xFFFF;
8732             }
8733             if (srcSingleStride != regDst->getHorzStride() || !hasSameSubregOffset(inst))
8734             {
8735                 // Need to force GRF-alignment and stride = 1
8736                 if (srcSingleStride != 1 || !regSrc0->checkGRFAlign())
8737                 {
8738                     // Make sure to do UD copy for src
8739                     regSrc0->setType(Type_UD);
8740                     // Insert mov before i
8741                     replaceSrc(i, 0, Type_UD, bb, ThirtyTwo_Word);
8742                     // must have the original type (float) for i
8743                     inst->getSrc(0)->asSrcRegRegion()->setType(Type_F);
8744                 }
8745                 if (regDst->getHorzStride() != 1 || !regDst->checkGRFAlign())
8746                 {
8747                     replaceDst(i, regDst->getType(), ThirtyTwo_Word);
8748                 }
8749                 return true;
8750             }
8751         }
8752     }
8753 
8754     return false;
8755 }
8756 
8757 // on PVC there are new restrictions on using byte/word region due to XBar reduction
fixByteXBarRestriction(INST_LIST_ITER it,G4_BB * bb)8758 void HWConformity::fixByteXBarRestriction(INST_LIST_ITER it, G4_BB* bb)
8759 {
8760     G4_INST* inst = *it;
8761 
8762     if (!inst->getDst() || inst->isSend() || inst->isDpas() ||
8763         inst->getExecSize() == g4::SIMD1)
8764     {
8765         // only check non-scalar ALU instructions
8766         return;
8767     }
8768 
8769     // due to much stronger restriction on float-pipe operation,
8770     // assume float-op has been fixed in fixUnalignedRegions
8771     if (isFloatOr64b(inst))
8772     {
8773         return;
8774     }
8775 
8776     // hardware checks restriction even on null
8777     if (inst->getDst()->isNullReg())
8778     {
8779         auto dst = inst->getDst();
8780         auto dstTy = dst->getType();
8781         auto stride = dst->getHorzStride();
8782 
8783         if ((dstTy == Type_W || dstTy == Type_UW) && stride < 2)
8784             dst->setHorzStride(2);
8785         else if (dstTy == Type_B || dstTy == Type_UB)
8786         {
8787             // create a new dst with W/UW type
8788             G4_DstRegRegion* new_null = builder.createNullDst(dstTy == Type_B ? Type_W : Type_UW);
8789             new_null->setHorzStride(2);
8790             inst->setDest(new_null);
8791         }
8792         return;
8793     }
8794 
8795     if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010493955) &&
8796         inst->opcode() == G4_mov && inst->getSaturate())
8797     {
8798         auto dst = inst->getDst();
8799         auto dstTy = dst->getType();
8800         if ((dstTy == Type_B || dstTy == Type_UB) &&
8801             inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion())
8802         {
8803             auto src = inst->getSrc(0)->asSrcRegRegion();
8804             auto srcTy = src->getType();
8805             if (srcTy == Type_B)
8806             {
8807                 insertMovBefore(it, 0, Type_D, bb);
8808                 return;
8809             }
8810             if (srcTy == Type_UB)
8811             {
8812                 insertMovBefore(it, 0, Type_UD, bb);
8813                 return;
8814             }
8815         }
8816     }
8817 
8818     auto isDclGRFAligned = [](G4_Declare* dcl)
8819     {
8820         if (!dcl)
8821         {
8822             return false;
8823         }
8824         uint32_t offset = 0;
8825         auto rootDcl = dcl->getRootDeclare(offset);
8826         return rootDcl->getSubRegAlign() >= GRFALIGN && (offset % getGRFSize() == 0);
8827     };
8828 
8829     bool needFix = false;
8830     auto dst = inst->getDst();
8831     auto dstTy = dst->getType();
8832     // FIXME: should call isOpndAligned() here, but seems later code processes subRegOff separately..
8833     bool dstAligned = (dst->getRegAccess() == Direct) && isDclGRFAligned(dst->getTopDcl());
8834     auto dstSubRegOff = dst->getSubRegOff();
8835     bool allDirect = (dst->getRegAccess() == Direct);
8836 
8837     // Fix for the odd destination subregister for G4_and, G4_or, G4_xor, G4_add, G4_asr, G4_sel, G4_cmp
8838     // Adding mov instruction to change inst dst subregister to even when conditions are met:
8839     // - instruction is at least two sources and dst isn't null
8840     // - dst sub-register is odd and dst stride is at least 1
8841     // - src0 reg region exist and isn't contiguous
8842     // - dst is B/UB, src0 is B/UB or W/UW, src1 is B/UB or W/UW
8843     if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010487853) &&
8844         inst->getNumSrc() > 1 &&
8845         inst->getDst() != NULL &&
8846         inst->getDst()->getSubRegOff() % 2 &&
8847         inst->getDst()->getHorzStride() > 1 &&
8848         inst->getSrc(0)->isSrcRegRegion() &&
8849         inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) == false &&
8850         IS_BTYPE(inst->getDst()->getType()) &&
8851         (IS_BTYPE(inst->getSrc(0)->getType()) || IS_WTYPE(inst->getSrc(0)->getType())) &&
8852         (IS_BTYPE(inst->getSrc(1)->getType()) || IS_WTYPE(inst->getSrc(1)->getType())))
8853     {
8854         auto newDstTy = inst->getDst()->getType();
8855         // if dst stride is equal 4 and b2b-DS2 rule isn't covered, changing dst type to dword
8856         if (inst->getDst()->getHorzStride() == 4)
8857         {
8858              newDstTy = Type_D;
8859              replaceDst(it, newDstTy);
8860              return;
8861         }
8862         // force a fix when we applied b2b or w2b rule
8863         needFix = true;
8864     }
8865 
8866     if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010487853) &&
8867         (dstTy == Type_B || dstTy == Type_UB) && (dstSubRegOff % 2) &&
8868         dst->getHorzStride() >= 4 && inst->getExecSize() == g4::SIMD32)
8869     {
8870         assert(canSplitInst(inst, NULL));
8871         evenlySplitInst(it, bb);
8872         return;
8873     }
8874     // check src0-to-dest and src1-to-dest restrictions
8875     for (int i = 0, numSrc = inst->getNumSrc(); !needFix && i < 2 && i < numSrc; ++i)
8876     {
8877         G4_SrcRegRegion* src = inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() ?
8878             inst->getSrc(i)->asSrcRegRegion() : nullptr;
8879         if (!src)
8880         {
8881             continue;
8882         }
8883         // check then fix the restriction on Src
8884         auto srcTy = src->getType();
8885         auto region = src->getRegion();
8886         bool srcDirect = (src->getRegAccess() == Direct);
8887         allDirect &= srcDirect;
8888         // skip VxH indirect case because src operand will be read out one element a time
8889         if (!srcDirect && region->isRegionWH())
8890         {
8891             continue;
8892         }
8893         bool srcAligned = srcDirect && isDclGRFAligned(src->getTopDcl());
8894         auto srcSubRegOff = src->getSubRegOff();
8895         auto numRows = 1;
8896         unsigned ss = (region->width == 1) ? region->vertStride : region->horzStride;
8897         // we need to check the region rule one row at a time under the following situation
8898         if (region->width > 1 && region->width * region->horzStride != region->vertStride)
8899         {
8900             numRows = inst->getExecSize() / region->width;
8901             assert((inst->getExecSize() % region->width) == 0);
8902         }
8903         for (int row = 0; row < numRows; ++row)
8904         {
8905             srcSubRegOff = (srcSubRegOff + row * region->vertStride) % (getGRFSize() / TypeSize(srcTy));
8906             dstSubRegOff = (dstSubRegOff + row * region->width * dst->getHorzStride()) % (getGRFSize() / TypeSize(dstTy));
8907             bool dstSubRegOffDwordAlign = ((dstSubRegOff % (4 / TypeSize(dstTy))) == 0);
8908             if (TypeSize(srcTy) == 2)
8909             {
8910                 // w2w and w2b rules
8911                 // cannot have the case of w2b packing case, i.e. dest-stride == 1
8912                 assert(!(TypeSize(dstTy) == 1 && dst->getHorzStride() == 1));
8913                 if ((TypeSize(dstTy) == 2 && dst->getHorzStride() == 1) ||
8914                     (TypeSize(dstTy) == 1 && dst->getHorzStride() == 2))
8915                 {
8916                     if (numRows > 1 && !dstSubRegOffDwordAlign)
8917                     {
8918                         needFix = true;
8919                     }
8920                     else if (ss == 2)
8921                     {
8922                         bool Aligned = srcAligned && dstAligned
8923                             && !(i == 1 && TypeSize(dstTy) == 1 && VISA_WA_CHECK(builder.getPWaTable(), Wa_16012383669))
8924                             && ((dstSubRegOff % (32 / TypeSize(dstTy))) == (srcSubRegOff / TypeSize(dstTy)));
8925                         needFix |= !Aligned;
8926                     }
8927                     else if (ss > 2)
8928                     {
8929                         needFix = true;
8930                     }
8931                 }
8932             }
8933             else if (TypeSize(srcTy) == 1)
8934             {
8935                 if (TypeSize(dstTy) == 2 && dst->getHorzStride() == 1)  // b2w rule
8936                 {
8937                     if (numRows > 1 && !dstSubRegOffDwordAlign)
8938                     {
8939                         needFix = true;
8940                     }
8941                     else if (ss == 4)
8942                     {
8943                         bool Aligned = srcAligned && dstAligned
8944                             && ((2 * (dstSubRegOff % 16)) == (srcSubRegOff / 2));
8945                         needFix |= !Aligned;
8946                     }
8947                     else if (ss == 8)
8948                     {
8949                         bool Aligned = srcAligned && dstAligned
8950                             && ((2 * (dstSubRegOff % 8)) == (srcSubRegOff / 4));
8951                         needFix |= !Aligned;
8952                     }
8953                     else if (ss > 8)
8954                     {
8955                         needFix = true;
8956                     }
8957                 }
8958                 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 2)  // b2b-DS2 rule
8959                 {
8960                     if (numRows > 1 && !dstSubRegOffDwordAlign)
8961                     {
8962                         needFix = true;
8963                     }
8964                     else if (ss == 4)
8965                     {
8966                         bool Aligned = srcAligned && dstAligned
8967                             && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
8968                         // change dstAligned to false, so we need a pack-shift
8969                         // in the end of the fix
8970                         if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
8971                         {
8972                             dstAligned &= (dstSubRegOff < 32);
8973                             Aligned &= (dstSubRegOff < 32);
8974                         }
8975                         needFix |= !Aligned;
8976                     }
8977                     else if (ss == 8)
8978                     {
8979                         bool Aligned = srcAligned && dstAligned
8980                             && ((dst->getSubRegOff() % 16) == (srcSubRegOff / 4));
8981                         needFix |= !Aligned;
8982                     }
8983                     else if (ss > 8)
8984                     {
8985                         needFix = true;
8986                     }
8987 
8988                 }
8989                 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 1 && region->width != 2) // b2b-DS1 rule
8990                 {
8991                     if (numRows > 1 && !dstSubRegOffDwordAlign)
8992                     {
8993                         needFix = true;
8994                     }
8995                     else if (ss == 2)
8996                     {
8997                         bool Aligned = srcAligned && dstAligned
8998                             && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
8999                         needFix |= !Aligned;
9000                     }
9001                     else if (ss == 4)
9002                     {
9003                         bool Aligned = srcAligned && dstAligned
9004                             && ((dstSubRegOff % 16) == (srcSubRegOff / 4));
9005                         // change dstAligned to false, so we need a pack-shift
9006                         // in the end of the fix
9007                         if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9008                         {
9009                             dstAligned &= (dstSubRegOff < 32);
9010                             Aligned &= (dstSubRegOff < 32);
9011                         }
9012                         needFix |= !Aligned;
9013                     }
9014                     else if (ss > 4)
9015                     {
9016                         needFix = true;
9017                     }
9018 
9019                 }
9020                 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 1 && region->width == 2)  // b2b-DS1 rule
9021                 {
9022                     if (numRows > 1 && !dstSubRegOffDwordAlign)
9023                     {
9024                         needFix = true;
9025                     }
9026                     else if (region->horzStride + region->vertStride >= 4)
9027                     {
9028                         if (region->horzStride == 2 && region->vertStride == 4)
9029                         {
9030                             bool Aligned = srcAligned && dstAligned
9031                                 && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
9032                             // change dstAligned to false, so we need a pack-shift
9033                             // in the end of the fix
9034                             if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9035                             {
9036                                 dstAligned &= (dstSubRegOff < 32);
9037                                 Aligned &= (dstSubRegOff < 32);
9038                             }
9039                             needFix |= !Aligned;
9040                         }
9041                         else if (region->horzStride == 4 && region->vertStride == 8)
9042                         {
9043                             bool Aligned = srcAligned && dstAligned
9044                                 && ((dstSubRegOff % 16) == (srcSubRegOff / 4));
9045                             // change dstAligned to false, so we need a pack-shift
9046                             // in the end of the fix
9047                             if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9048                             {
9049                                 dstAligned &= (dstSubRegOff < 32);
9050                                 Aligned &= (dstSubRegOff < 32);
9051                             }
9052                             needFix |= !Aligned;
9053 
9054                         }
9055                         else
9056                         {
9057                             needFix = true;
9058                         }
9059                     }
9060                     else if (region->horzStride == 2)
9061                     {
9062                         // DS==1 && W==2 && HS==2 && VS == 0 or 1
9063                         needFix = true;
9064                     }
9065                 }
9066             }
9067         }
9068     }
9069 
9070     if (needFix)
9071     {
9072         if (inst->getExecSize() == g4::SIMD2 && allDirect && inst->getNumSrc() != 3)
9073         {
9074             // just split the inst
9075             evenlySplitInst(it, bb);
9076             return;
9077         }
9078 
9079         auto scale = 4 / TypeSize(dstTy);
9080         const RegionDesc* unpackRegion = builder.createRegionDesc(scale, 1, 0);
9081         dstSubRegOff = dst->getSubRegOff() % (getGRFSize() / TypeSize(dstTy));
9082 
9083         // compute the sub-reg-offset we need to use
9084         short tmpSSR = 0;
9085         if (TypeSize(dstTy) == 2)
9086         {
9087             tmpSSR = 2 * (dstSubRegOff % 16);
9088         }
9089         else
9090         {
9091             assert(TypeSize(dstTy) == 1);
9092             if (dst->getHorzStride() == 2)
9093             {
9094                 tmpSSR = 2 * (dstSubRegOff % 32);
9095             }
9096             else
9097             {
9098                 assert(dst->getHorzStride() == 1);
9099                 tmpSSR = 4 * (dstSubRegOff % 16);
9100             }
9101         }
9102         auto tempSize = std::max(inst->getExecSize() * scale + tmpSSR, getGRFSize() / TypeSize(dstTy));
9103 
9104         // Replace the dest with a temp, same-type, offset == 0
9105         // stride == 2 for word-type; stride == 4 for byte-type
9106         // Add a B2B or W2W pack-move from temp.0(stride; 1, 0)  to the original-dest.sub(ds)
9107         // however, if the original-dest is NOT grf-aligned, we need another B2B or W2W
9108         // to shift the location of packed bytes or words after packing.
9109         if (dstAligned && (tempSize <= (unsigned short)(getGRFSize() * 2)))
9110         {
9111             G4_Declare* unpackDcl = builder.createTempVar(tempSize, dstTy, GRFALIGN);
9112 
9113             G4_SrcRegRegion* unpackSrc = builder.createSrc(
9114                 unpackDcl->getRegVar(),
9115                 0,
9116                 tmpSSR,
9117                 unpackRegion,
9118                 unpackDcl->getElemType());
9119 
9120             G4_Predicate* pred = NULL;
9121             if (inst->opcode() != G4_sel)
9122             {
9123                 pred = inst->getPredicate();
9124                 inst->setPredicate(NULL);
9125                 // maintainDU4TempMov will update def-use
9126             }
9127             unsigned int new_option = inst->getMaskOption();
9128 
9129             auto pos = it;
9130             pos++;
9131 
9132             // insert the packing move
9133             G4_INST* packInst = builder.createMov(inst->getExecSize(), dst, unpackSrc, new_option, false);
9134             packInst->setPredicate(pred);
9135             bb->insertBefore(pos, packInst);
9136 
9137             // update def-use info
9138             maintainDU4TempMov(inst, packInst);
9139             // change the destination of the original instruction
9140             if (dstTy == Type_UW || dstTy == Type_W || inst->getSaturate() || (tmpSSR % scale))
9141             {
9142                 auto tmpDst = builder.createDst(
9143                     unpackDcl->getRegVar(),
9144                     0,
9145                     tmpSSR,
9146                     scale,
9147                     unpackDcl->getElemType());
9148                 inst->setDest(tmpDst);
9149             }
9150             else
9151             {
9152                 // use dword destination to avoid read-modify-write
9153                 G4_Declare* tmpDstDcl =
9154                     builder.createTempVar(tempSize / scale,
9155                     (dstTy == Type_UB) ? Type_UD : Type_D, GRFALIGN);
9156                 tmpDstDcl->setAliasDeclare(unpackDcl, 0);
9157                 auto tmpDst = builder.createDst(
9158                     tmpDstDcl->getRegVar(),
9159                     0,
9160                     tmpSSR / scale,
9161                     1,
9162                     tmpDstDcl->getElemType());
9163                 inst->setDest(tmpDst);
9164             }
9165         }
9166         else
9167         {
9168             G4_Declare* unpackDcl = builder.createTempVar(inst->getExecSize() * scale, dstTy, GRFALIGN);
9169             G4_SrcRegRegion* unpackSrc = builder.createSrcRegRegion(unpackDcl, unpackRegion);
9170             G4_Predicate* pred = NULL;
9171             if (inst->opcode() != G4_sel)
9172             {
9173                 pred = inst->getPredicate();
9174                 inst->setPredicate(NULL);
9175                 // maintainDU4TempMov will update def-use
9176             }
9177             unsigned int new_option = inst->getMaskOption();
9178             auto pos = it;
9179             pos++;
9180             auto dstride = dst->getHorzStride();
9181             const RegionDesc* shiftRegion = builder.createRegionDesc(dstride, 1, 0);
9182             G4_Declare* shiftDcl = builder.createTempVar(inst->getExecSize() * dstride, dstTy, GRFALIGN);
9183             G4_SrcRegRegion* shiftSrc = builder.createSrcRegRegion(shiftDcl, shiftRegion);
9184             auto packTmp = builder.createDstRegRegion(shiftDcl, dstride);
9185             // pack
9186             G4_INST* packInst = builder.createMov(inst->getExecSize(), packTmp, unpackSrc, new_option, false);
9187             packInst->setPredicate(pred);
9188             bb->insertBefore(pos, packInst);
9189             // then shift the bytes and words location
9190             G4_INST* shiftInst = builder.createMov(inst->getExecSize(), dst, shiftSrc, new_option, false);
9191             shiftInst->setPredicate(pred);
9192             bb->insertBefore(pos, shiftInst);
9193             // update propagation info
9194             maintainDU4TempMov(inst, shiftInst);
9195             // change the destination of the original instruction
9196             if (dstTy == Type_UW || dstTy == Type_W || inst->getSaturate())
9197             {
9198                 inst->setDest(builder.createDstRegRegion(unpackDcl, scale));
9199             }
9200             else
9201             {
9202                 // situations we use dword-tmp to reduce byte-read-mod-write
9203                 G4_Declare* tmpDstDcl =
9204                     builder.createTempVar(inst->getExecSize(),
9205                     (dstTy == Type_UB) ? Type_UD : Type_D, GRFALIGN);
9206                 tmpDstDcl->setAliasDeclare(unpackDcl, 0);
9207                 inst->setDest(builder.createDstRegRegion(tmpDstDcl, 1));
9208             }
9209         }
9210     }
9211 }
9212 
fixSrnd(INST_LIST_ITER it,G4_BB * bb)9213 bool HWConformity::fixSrnd(INST_LIST_ITER it, G4_BB* bb)
9214 {
9215     G4_INST* inst = *it;
9216     if (inst->opcode() != G4_srnd)
9217     {
9218         return false;
9219     }
9220 
9221     bool changed = false;  // return value
9222 
9223     // case 1. src0 cannot be imm.
9224     // case 2. subreg must be zero  (must be grf-aligned)
9225     // case 3. For HF->BF8,  both dst and src must be packed
9226     // srnd: https://gfxspecs.intel.com/Predator/Home/Index/67451
9227     G4_DstRegRegion* dst = inst->getDst();
9228     uint32_t execsize = inst->getExecSize();
9229     bool Packed = (dst->getType() == Type_UB);
9230     if (!dst->checkGRFAlign() ||                // case 2
9231         (Packed && dst->getHorzStride() != 1))  // case 3
9232     {
9233         G4_Declare* dcl = builder.createTempVar(execsize, dst->getType(), GRFALIGN);
9234         G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(
9235             dcl,
9236             execsize == 1 ? builder.getRegionScalar() : builder.getRegionStride1());
9237         uint32_t newOption = InstOpt_WriteEnable | inst->getMaskOption();
9238         G4_INST* newInst = builder.createMov(G4_ExecSize(execsize), dst, srcRegion, newOption, false);
9239         bb->insertAfter(it, newInst);
9240 
9241         G4_DstRegRegion* newDst = builder.createDstRegRegion(dcl, 1);
9242         inst->setDest(newDst);
9243         changed = true;
9244     }
9245 
9246     G4_Operand* opnd0 = inst->getSrc(0);
9247     if (opnd0->isImm() || // case 1
9248         !opnd0->asSrcRegRegion()->checkGRFAlign() ||  // case 2
9249         (Packed && !opnd0->asSrcRegRegion()->getRegion()->isContiguous(execsize))) // case 3
9250     {
9251         G4_Operand* newSrc0 = insertMovBefore(it, 0, opnd0->getType(), bb, GRFALIGN);
9252         inst->setSrc(newSrc0, 0);
9253         G4_INST* newMovInst = *(std::prev(it));
9254         newMovInst->setNoMask(true);
9255         changed = true;
9256     }
9257 
9258     G4_Operand* opnd1 = inst->getSrc(1);
9259     if (opnd1->isSrcRegRegion() &&
9260         (!opnd1->asSrcRegRegion()->checkGRFAlign() || // case 2
9261          (Packed && !opnd1->asSrcRegRegion()->getRegion()->isContiguous(execsize)))) // case 3
9262     {
9263         G4_Operand* newSrc1 = insertMovBefore(it, 1, opnd1->getType(), bb, GRFALIGN);
9264         inst->setSrc(newSrc1, 1);
9265         G4_INST* newMovInst = *(std::prev(it));
9266         newMovInst->setNoMask(true);
9267         changed = true;
9268     }
9269     return changed;
9270 }
9271 
fixShiftInsts(INST_LIST_ITER i,G4_BB * bb)9272 void HWConformity::fixShiftInsts(INST_LIST_ITER i, G4_BB* bb)
9273 {
9274     G4_INST* inst = *i;
9275     if (inst->opcode() != G4_shl && inst->opcode() != G4_shr && inst->opcode() != G4_asr)
9276     {
9277         return;
9278     }
9279 
9280     auto dst = inst->getDst();
9281     auto src0 = inst->getSrc(0);
9282     auto src1 = inst->getSrc(1);
9283 
9284     bool needWA = false;
9285 
9286     if (builder.getPlatform() == GENX_PVCXT && !IS_QTYPE(dst->getType()) && !IS_QTYPE(src0->getType()) && IS_QTYPE(src1->getType()))
9287     {
9288         needWA = true;
9289     }
9290 
9291     if (builder.getOption(vISA_forceSrc0ToQwForQwShlWA) && inst->opcode() == G4_shl && IS_QTYPE(dst->getType()) && !IS_QTYPE(src0->getType()))
9292     {
9293         needWA = true;
9294     }
9295 
9296     if (needWA)
9297     {
9298         G4_Operand* newSrc0 = insertMovBefore(i, 0, IS_SIGNED_INT(src0->getType()) ? Type_Q : Type_UQ, bb);
9299         inst->setSrc(newSrc0, 0);
9300     }
9301 }
9302 
hasDedicateAlignRegionConformity(const G4_INST * I) const9303 bool HWConformity::hasDedicateAlignRegionConformity(const G4_INST *I) const
9304 {
9305     switch (I->opcode())
9306     {
9307     case G4_fcvt:
9308         return true;
9309     case G4_srnd:
9310         return true;
9311     default:
9312         break;
9313     }
9314     return false;
9315 }
9316 
9317 // get rid of source modifiers on this inst[srcPos]
fixSrc1Region(INST_LIST_ITER it,G4_BB * bb)9318 void HWConformity::fixSrc1Region(INST_LIST_ITER it, G4_BB* bb)
9319 {
9320     G4_INST* inst = *it;
9321     G4_Operand* src1 = inst->getSrc(1);
9322 
9323     // need extra move if horzStride >= 4
9324     if (src1->isSrcRegRegion() && src1->asSrcRegRegion()->getRegion()->horzStride >= 4)
9325     {
9326         G4_Operand* new_src1 = insertMovBefore(it, 1, src1->getType(), bb);
9327         inst->setSrc(new_src1, 1);
9328     }
9329 }
9330 
fixMadwInst(INST_LIST_ITER it,G4_BB * bb)9331 INST_LIST_ITER HWConformity::fixMadwInst(INST_LIST_ITER it, G4_BB* bb)
9332 {
9333     G4_INST* madwInst = *it;
9334     auto execSize = madwInst->getExecSize();
9335     MUST_BE_TRUE(madwInst->opcode() == G4_madw, "expect madw instruction");
9336 
9337     MUST_BE_TRUE(builder.getPlatform() >= GENX_PVC || execSize != g4::SIMD32, "SIMD32 is not supported on this platform for madw");
9338 
9339     auto dst = madwInst->getDst();
9340     MUST_BE_TRUE(IS_DTYPE(dst->getType()), "dst only supports DW type");
9341 
9342     auto src0 = madwInst->getSrc(0);
9343     auto src1 = madwInst->getSrc(1);
9344     auto src2 = madwInst->getSrc(2);
9345     MUST_BE_TRUE(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()) && IS_DTYPE(src2->getType()), "only DW-type sources are supported");
9346 
9347     // src1 does not support modifier
9348     checkSrcMod(it, bb, 1);
9349 
9350     // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
9351     fixSrc1Region(it, bb);
9352     src1 = madwInst->getSrc(1);
9353 
9354     // fix modifier for src0
9355     if (!builder.supportSrcModforMul())
9356     {
9357         checkSrcMod(it, bb, 0);
9358         src0 = madwInst->getSrc(0);
9359     }
9360 
9361     // sat cannot be used at all in the macro sequence
9362     // make the dst GRF-aligned before expanding to macro
9363     if (madwInst->getSaturate() ||
9364         dst->getHorzStride() != 1 ||
9365         isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst) ||
9366         !builder.isOpndAligned(dst, getGRFSize()))
9367     {
9368         // add tmp mov instructions
9369         int dstLowGRFNum = (int)std::ceil((float)(execSize * dst->getExecTypeSize()) / getGRFSize());
9370         int dstTotalGRFNum = dstLowGRFNum * 2;
9371 
9372         G4_Declare* newDstDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstTotalGRFNum, dst->getType(), GRFALIGN);
9373 
9374         // add a tmp mov for low results in dst
9375         G4_Declare* lowMovSrcDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstLowGRFNum, dst->getType(), GRFALIGN);
9376         lowMovSrcDcl->setAliasDeclare(newDstDcl, 0);
9377         G4_SrcRegRegion* lowMovSrc = builder.createSrcRegRegion(lowMovSrcDcl, builder.getRegionStride1());
9378         auto dstLow = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
9379         G4_INST* lowMovInst = builder.createMov(execSize, dstLow, lowMovSrc, madwInst->getMaskOption(), false);
9380         lowMovInst->setPredicate(madwInst->getPredicate());
9381         lowMovInst->setSaturate(madwInst->getSaturate());
9382         auto insertIter = bb->insertAfter(it, lowMovInst);
9383         maintainDU4TempMov(madwInst, lowMovInst);
9384 
9385         // add a tmp mov for high results in dst
9386         G4_Declare* hiMovSrcDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstLowGRFNum, dst->getType(), GRFALIGN);
9387         hiMovSrcDcl->setAliasDeclare(newDstDcl, dstLowGRFNum * getGRFSize());
9388         G4_SrcRegRegion* hiMovSrc = builder.createSrcRegRegion(hiMovSrcDcl, builder.getRegionStride1());
9389         auto dstHi = builder.createDst(dst->getBase(), dst->getRegOff() + dstLowGRFNum, dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
9390         G4_INST* hiMovInst = builder.createMov(execSize, dstHi, hiMovSrc, madwInst->getMaskOption(), false);
9391         hiMovInst->setPredicate(madwInst->getPredicate());
9392         hiMovInst->setSaturate(madwInst->getSaturate());
9393         bb->insertAfter(insertIter, hiMovInst);
9394         maintainDU4TempMov(madwInst, hiMovInst);
9395 
9396         G4_DstRegRegion* newDst = builder.createDstRegRegion(newDstDcl, 1);
9397         madwInst->setDest(newDst);
9398         madwInst->setPredicate(nullptr);
9399         madwInst->setSaturate(g4::NOSAT);
9400         dst = newDst;
9401     }
9402 
9403     INST_LIST_ITER retIter = it;
9404     if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMadwPostSchedule))
9405     {
9406         // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Madw->Mul+Mach+Addc+Add expanding
9407         // will be done in expandMadwPostSchedule pass.
9408 
9409         // need extra mov if dst is acc and src0 is indirect
9410         if (!builder.accDstforIndirectSrc())
9411         {
9412             if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
9413             {
9414                 madwInst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
9415             }
9416         }
9417 
9418         // add implicit acc dst to the madw instruction as acc will be used as dst of the expanded mul after local scheduling.
9419         // it is a must to fix the WAR/WAW issue of acc in local scheduling.
9420         G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, madwInst->getDst()->getType());
9421         madwInst->setImplAccDst(accDstOpnd);
9422 
9423         retIter = std::next(it);
9424     }
9425     else
9426     {
9427         // SOA layout of dst:(dst_hi32:d, dst_lo32:d)
9428         // if src2 is not immediate value of zero, then expand MADW((dst_hi32, dst_lo32) = src0 * src1 + src2) to:
9429         //     mul  (16) acc0.0<1>:d    src0<1;1,0>:d    src1<2;1,0>:uw
9430         //     mach (16) dst_hi32<1>:d  src0<1;1,0>:d    src1<1;1,0>:d
9431         //     addc (16) dst_lo32<1>:d  acc0.0<1;1,0>:d  src2<1;1,0>:d     // Low 32 bits
9432         //     add  (16) dst_hi32<1>:d  acc0.0<1;1,0>:d  dst_hi32<1;1,0>:d // High 32 bits
9433         // otherwise, expand to:
9434         //     mul  (16) acc0.0<1>:d    src0<1;1,0>:d    src1<2;1,0>:uw
9435         //     mach (16) dst_hi32<1>:d  src0<1;1,0>:d    src1<1;1,0>:d // High 32 bits
9436         //     mov  (16) dst_lo32<1>:d  acc0.0<1;1,0>:d                // Low 32 bits
9437 
9438         uint32_t origOptions = madwInst->getOption();
9439         G4_Predicate* origPredicate = madwInst->getPredicate();
9440         G4_Type tmpType = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType()) && IS_UNSIGNED_INT(src2->getType())) ? Type_UD : Type_D;
9441 
9442         // 1, create a new mul inst
9443         G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
9444         auto newMul = builder.createBinOp(G4_mul, execSize,
9445             accDstOpnd, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, false);
9446         auto startIter = bb->insertBefore(it, newMul);
9447         madwInst->copyDefsTo(newMul, false);
9448         // change src1 type to uw type
9449         fixMulSrc1(std::prev(it), bb);
9450 
9451         // 2, create a mach inst
9452         int DstHiRegOffset = (int)std::ceil((float)(execSize * TypeSize(tmpType)) / getGRFSize());
9453         G4_DstRegRegion* dstHi32 = builder.createDst(dst->getBase(), dst->getRegOff() + DstHiRegOffset, dst->getSubRegOff(), 1, tmpType);
9454         G4_INST* machInst = builder.createMach(execSize,
9455             dstHi32, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, tmpType);
9456         machInst->setPredicate(origPredicate);
9457         *it = machInst;
9458         madwInst->transferUse(machInst);
9459         madwInst->removeAllDefs();
9460         newMul->addDefUse(machInst, Opnd_implAccSrc);
9461 
9462         auto endIter = it;
9463         // optimize: only do multiply if src2 is imme 0
9464         if (src2->isImm() && src2->asImm()->getImm() == 0)
9465         {
9466             // 3, create a mov inst
9467             auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, tmpType);
9468             auto accSrcOpndMov = builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0,
9469                 execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
9470             auto movInst = builder.createMov(execSize, dstLo32, accSrcOpndMov, origOptions, false);
9471             movInst->setPredicate(origPredicate);
9472             endIter = bb->insertAfter(endIter, movInst);
9473         }
9474         else
9475         {
9476             // 3, create a addc inst
9477             auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, tmpType);
9478             auto accSrcOpnd = builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0,
9479                 execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
9480             auto addcInst = builder.createBinOp(G4_addc, execSize, dstLo32, accSrcOpnd, builder.duplicateOperand(src2), origOptions, false);
9481             addcInst->setPredicate(origPredicate);
9482             endIter = bb->insertAfter(endIter, addcInst);
9483 
9484             // 4, create a add inst
9485             auto src1Add = builder.createSrc(dstHi32->getBase(), dstHi32->getRegOff(), dstHi32->getSubRegOff(),
9486                 execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
9487             auto addInst = builder.createBinOp(G4_add, execSize, builder.duplicateOperand(dstHi32), builder.duplicateOperand(accSrcOpnd), src1Add, origOptions, false);
9488             addInst->setPredicate(origPredicate);
9489             endIter = bb->insertAfter(endIter, addInst);
9490         }
9491 
9492         // split inst if execSize is larger than native execSize
9493         if (execSize > builder.getNativeExecSize())
9494         {
9495             splitDWMULInst(startIter, endIter, bb);
9496             retIter = startIter;
9497         }
9498         else
9499         {
9500             retIter = std::prev(it);
9501         }
9502     }
9503 
9504     return retIter;
9505 }
9506 
9507 // Currently the local copy propagation phase (newLocalDefHoisting) might be
9508 // too aggressive and could fold a0 register into a select in the float pipe
9509 // which is illegal. We try to fix the instruction in HWConformity because we
9510 // may fix it easily by just flipping the types when it is a raw MOV or a raw
9511 // SEL. This would keep the fp semantics and still save one MOV. Here's an
9512 // example pattern being dealt with.
9513 //
9514 // BEFORE:
9515 // (W&f0.0) sel (1|M0) a0.0<1>:f  r5.2<0;1,0>:f  r3.3<0;1,0>:f
9516 // =>
9517 // AFTER:
9518 // (W&f0.0) sel (1|M0) a0.0<1>:ud r5.2<0;1,0>:ud r3.3<0;1,0>:ud
9519 //
9520 // For others cases, to keep the fp semantics first we create a temp GRF and
9521 // set it as the new dst of the inst. Then we insert a MOV to the old dst (ARF)
9522 // using the int pipe.
9523 //
9524 // BEFORE:
9525 // (W&f0.0) sel (1|M0) (lt)f0.0 a0.0<1>:f  r5.2<0;1,0>:f  r3.3<0;1,0>:f
9526 // =>
9527 // AFTER:
9528 // (W&f0.0) sel (1|M0) (lt)f0.0 r2.0<0;1,0>:f r5.2<0;1,0>:f r3.3<0;1,0>:f
9529 // (W&f0.0) mov (1|M0) a0.0<1>:ud  r2.0<0;1,0>:ud
fixFloatARFDst(INST_LIST_ITER it,G4_BB * bb)9530 void HWConformity::fixFloatARFDst(INST_LIST_ITER it, G4_BB* bb)
9531 {
9532     auto isDstTargetedARFInFloat = [](G4_DstRegRegion* dst) -> bool {
9533         if (!dst || !dst->getTopDcl())
9534             return false;
9535 
9536         // Currently when ARF is used as an index register in dst, vISA treats
9537         // the dst as an ARF dst. Skip the IndirGRF case and return true if the
9538         // dst is ARF/FLAG with a fp type and Direct access. Here's an example
9539         // pattern.
9540         // mov (2)              r[A0(0,0), 0]<4>:f  V44(0,0)<1;1,0>:f
9541         auto regFile = dst->getTopDcl()->getRegFile();
9542         return (regFile == G4_ADDRESS || regFile == G4_FLAG) &&
9543                IS_TYPE_FLOAT_ALL(dst->getType()) &&
9544                (dst->getRegAccess() == Direct);
9545     };
9546 
9547     auto isRawSel = [](G4_INST* inst) -> bool {
9548         return inst->opcode() == G4_sel &&
9549             inst->getDst()->getType() == inst->getSrc(0)->getType() &&
9550             inst->getDst()->getType() == inst->getSrc(1)->getType() &&
9551             inst->getCondMod() == nullptr &&
9552             (inst->getSrc(0)->isSrcRegRegion() &&
9553              inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef) &&
9554             (inst->getSrc(1)->isImm() ||
9555              (inst->getSrc(1)->isSrcRegRegion() &&
9556               inst->getSrc(1)->asSrcRegRegion()->getModifier() == Mod_src_undef));
9557     };
9558 
9559     auto getFlippedIntType = [](G4_Type floatTy) -> G4_Type {
9560         assert(IS_TYPE_FLOAT_ALL(floatTy));
9561         switch (TypeSize(floatTy)) {
9562             case 2:
9563                 return Type_UW;
9564             case 4:
9565                 return Type_UD;
9566             case 8:
9567                 return Type_UQ;
9568             default:
9569                 assert(false && "unexpected float type size.");
9570                 return Type_UNDEF;
9571         }
9572     };
9573 
9574     G4_INST* inst = *it;
9575     G4_DstRegRegion* dst = inst->getDst();
9576     if (!isDstTargetedARFInFloat(dst))
9577         return;
9578 
9579     G4_Type floatTy = dst->getType();
9580     G4_Type intTy = getFlippedIntType(floatTy);
9581     if (inst->isRawMov() || isRawSel(inst))
9582     {
9583         // For raw MOV and raw predicate-based SEL (w/o conditional modifier),
9584         // we can just flip the types.
9585         dst->setType(intTy);
9586         for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
9587         {
9588             auto src = inst->getSrc(i);
9589             if (src->isSrcRegRegion())
9590             {
9591                 src->asSrcRegRegion()->setType(intTy);
9592             }
9593             else if (src->isImm())
9594             {
9595                 inst->setSrc(builder.createImm(src->asImm()->getImm(), intTy), i);
9596             }
9597         }
9598     }
9599     else
9600     {
9601         // For others, 2 steps are required.
9602         // 1. Replace the dst with a temp var in float type.
9603         G4_Declare* newDefDcl =
9604             builder.createTempVar(1, floatTy, dst->getTopDcl()->getSubRegAlign());
9605         inst->setDest(builder.createDstRegRegion(newDefDcl, 1));
9606 
9607         // 2. Create a MOV that moves the temp var to the old dst (ARF).
9608         G4_Declare* newUseDcl = builder.createTempVar(1, intTy,
9609                                                       dst->getTopDcl()->getSubRegAlign());
9610         newUseDcl->setAliasDeclare(newDefDcl, 0);
9611         const RegionDesc* rd = inst->getExecSize() == 1 ?
9612             builder.getRegionScalar() : builder.getRegionStride1();
9613         G4_SrcRegRegion* newSrcRegion = builder.createSrcRegRegion(newUseDcl, rd);
9614         dst->setType(intTy);
9615         G4_INST* movInst = builder.createMov(inst->getExecSize(), dst, newSrcRegion, inst->getMaskOption(), false);
9616         bb->insertAfter(it, movInst);
9617     }
9618 }
9619