1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "HWConformity.h"
10 #include "Optimizer.h"
11 #include "G4_Verifier.hpp"
12 #include "InstSplit.h"
13 
14 using namespace vISA;
15 
checkMinExecSize(G4_opcode op)16 uint8_t HWConformity::checkMinExecSize(G4_opcode op)
17 {
18     if (op == G4_dp2 ||
19         op == G4_dp3 ||
20         op == G4_dp4 ||
21         op == G4_dph)
22     {
23         return 4;
24     }
25     else if (op == G4_line || op == G4_pln) {
26         return 8;
27     }
28     else if (op == G4_sad2 || op == G4_sada2) {
29         return 2;
30     }
31     else
32         return 1;
33 }
34 
fixOpndTypeAlign(G4_BB * bb)35 void HWConformity::fixOpndTypeAlign(G4_BB* bb)
36 {
37     INST_LIST_ITER i = bb->begin();
38     INST_LIST_ITER next_iter = i;
39     bool needSplit = false;
40 
41     for (auto iEnd = bb->end(); i != iEnd; i = next_iter)
42     {
43         G4_INST *inst = *i;
44         G4_opcode opcode = inst->opcode();
45         if (opcode == G4_nop || opcode == G4_label || inst->mayExceedTwoGRF()) {
46             next_iter++;
47         } else if (fixInstOpndTypeAlign(i, bb)) {
48             needSplit = true;
49             next_iter = i;
50             next_iter++;
51         } else {
52             next_iter++;
53         }
54 #ifdef _DEBUG
55         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
56 #endif
57     }
58 
59     if (needSplit)
60     {
61         // make sure updated insts and new moves don't cross 2 GRF
62         InstSplitPass instSplitter(&builder);
63         instSplitter.runOnBB(bb);
64     }
65 }
66 
67 // Fix instructions with vector immediate as source operands.
68 //    mov (8) r5.0<2>:uw 0xfdb97531:uv {Align1}
69 // becomes
70 //    mov (8) r6.0<1>:uw 0xfdb97531:uv {Align1}
71 //    mov (8) r5.0<2>:uw r6.0<8;8,1>:uw {Align1, Q1}
72 //
73 // When an immediate vector is used in an instruction, the destination must
74 // be 128-bit aligned with destination horizontal stride equivalent to a
75 // word for an immediate integer vector (v) and equivalent to a DWord for an
76 // immediate float vector (vf).
77 bool
fixDstAlignmentWithVectorImm(INST_LIST_ITER iter,G4_BB * bb)78 HWConformity::fixDstAlignmentWithVectorImm(INST_LIST_ITER iter, G4_BB *bb)
79 {
80     bool changed = false;
81     G4_INST *inst = *iter;
82     G4_DstRegRegion *reg = inst->getDst();
83     uint8_t execSize = inst->getExecSize();
84 
85     bool dstAligned = builder.isOpndAligned(reg, 16);
86 
87     unsigned hsInBytes = reg->getHorzStride() * reg->getTypeSize();
88     for (int k = 0, e = inst->getNumSrc(); k < e; ++k)
89     {
90         G4_Operand *src = inst->getSrc(k);
91         if (!src || !src->isImm())
92             continue;
93 
94         G4_Type ty = src->getType();
95         G4_Type moveTy = (ty == Type_V) ? Type_W :
96             (ty == Type_UV) ? Type_UW :
97             (ty == Type_VF) ? Type_F : Type_UNDEF;
98         if (moveTy == Type_UNDEF)
99             continue;
100 
101         if (!dstAligned)
102         {
103             inst->setSrc(insertMovBefore(iter, k, moveTy, bb), k);
104             changed = true;
105         }
106         else if (hsInBytes != TypeSize(moveTy))
107         {
108             if (hsInBytes == 4 && execSize < 8)
109             {
110                 // for the case where dst is dword and execution size is < 8,
111                 // we can interleave the vector to avoid a move
112                 // e.g., mov (2) r1.0<1>:d 0x21:uv  -->
113                 //       mov (2) r1.0<1>:d 0x0201:uv
114                 uint32_t bitValue = 0;
115                 uint16_t immBits = (uint16_t) src->asImm()->getImm();
116                 for (int i = 0; i < execSize; ++i)
117                 {
118                     int val = (immBits >> (i*4)) & 0xF;
119                     bitValue |= val << (i * 8);
120                 }
121                 inst->setSrc(builder.createImm(bitValue, ty), k);
122             }
123             else
124             {
125                 inst->setSrc(insertMovBefore(iter, k, moveTy, bb), k);
126                 changed = true;
127             }
128         }
129     }
130 
131     return changed;
132 }
133 
134 // Do basic HW conformity check related to operand type and dst alignment before resucing execution size
135 // to avoid spliting of the MOV inserted in this stage.
136 // This function is called for some instructions generated in later stages.
fixInstOpndTypeAlign(INST_LIST_ITER i,G4_BB * bb)137 bool HWConformity::fixInstOpndTypeAlign(INST_LIST_ITER i, G4_BB* bb)
138 {
139     G4_INST *inst = *i;
140     bool insertedInst = false;
141 
142     if (inst->opcode() == G4_srnd)
143     {
144         // Operands can be packed.
145         return false;
146     }
147 
148     int extypesize = 0;
149     G4_Type extype = inst->getOpExecType(extypesize);
150 
151     if (extypesize == numEltPerGRF<Type_UB>()/2 && inst->opcode() != G4_mov)
152     {
153         fixPackedSource(i, bb);
154         extype = inst->getOpExecType(extypesize);
155     }
156 
157     // fixes opernds including
158     // swapping sel,
159     fixOpnds(i, bb, extype);
160 
161     extype = inst->getOpExecType(extypesize);
162     if (inst->getDst() && !(inst->isSend()) && !(inst->isRawMov()))
163     {
164         if (extypesize < (int)numEltPerGRF<Type_UB>()/2)
165         {
166             uint32_t dst_elsize = inst->getDst()->getTypeSize();
167             if (dst_elsize < (unsigned int)extypesize)
168             {
169                 if (fixDstAlignment(i, bb, extype, dst_elsize))
170                 {
171                     insertedInst = true;
172                 }
173             }
174         }
175 
176         auto hasPackedImm = [](G4_INST *inst) {
177           for (unsigned i = 0, e = inst->getNumSrc(); i != e; ++i) {
178             auto src = inst->getSrc(i);
179             if (!src || !src->isImm())
180               continue;
181             switch (src->getType()) {
182             case Type_V:
183             case Type_UV:
184             case Type_VF:
185               return true;
186             default:
187               break;
188             }
189           }
190           return false;
191         };
192 
193         // There are vector immediate source operands.
194         if (hasPackedImm(*i)) {
195           if ((insertedInst = fixDstAlignmentWithVectorImm(i, bb))) {
196             // Recompute the execution type size if there is some change.
197             // This allows fixDstAlignment to fix possible conformity issues.
198             extype = inst->getOpExecType(extypesize);
199             uint32_t dst_elsize = inst->getDst()->getTypeSize();
200             if (dst_elsize < unsigned(extypesize)) {
201               if (fixDstAlignment(i, bb, extype, dst_elsize)) {
202                 insertedInst = true;
203               }
204             }
205           }
206         }
207     }
208 
209     return insertedInst;
210 }
211 
212 // check Rule 2H
213 // VertStride must be used to cross GRF register boundaries. This rule implies that elements within a 'Width' cannot cross GRF boundaries.
214 // This is a separate function from fixSrcRegion because we may need to split the instruction to satisfy this rule
checkSrcCrossGRF(INST_LIST_ITER & iter,G4_BB * bb)215 bool HWConformity::checkSrcCrossGRF(INST_LIST_ITER& iter, G4_BB* bb)
216 {
217     G4_INST* inst = *iter;
218     for (int i = 0; i < G4_MAX_SRCS; i++)
219     {
220         if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion())
221         {
222             G4_SrcRegRegion *src = inst->getSrc(i)->asSrcRegRegion();
223             bool widthCrossingGRF = false;
224             const RegionDesc* srcRegion = src->getRegion();
225             uint16_t vs = srcRegion->vertStride, wd = srcRegion->width, hs = srcRegion->horzStride;
226             uint8_t exSize = inst->getExecSize();
227             if (src->getRegAccess() == Direct && src->crossGRF())
228             {
229                 int elementSize = src->getTypeSize();
230                 int startOffset = src->getLeftBound() % numEltPerGRF<Type_UB>();
231                 for (int row = 0; row < exSize / wd; row++)
232                 {
233                     int rowOffset = (startOffset + row * vs * elementSize) % numEltPerGRF<Type_UB>();
234                     if (rowOffset + (wd - 1) * hs * elementSize >= (int)numEltPerGRF<Type_UB>())
235                     {
236                         widthCrossingGRF = true;
237                         break;
238                     }
239                 }
240             }
241             else if (src->getRegAccess() == IndirGRF)
242             {
243                 widthCrossingGRF = wd > 1 && hs != 0;
244             }
245 
246             auto doSplit = [&](bool canCrossGRF) -> void {
247                 if (inst->usesFlag() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst()))
248                 {
249                     // splitting may be unsafe, insert a move then split the move
250                     G4_Operand* newSrc = insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb);
251                     inst->setSrc(newSrc, i);
252                     auto movIter = iter;
253                     --movIter;
254                     splitInstruction(movIter, bb, false, 0, false, canCrossGRF);
255                 }
256                 else
257                 {
258                     splitInstruction(iter, bb, false, 0, false, canCrossGRF);
259                 }
260             };
261 
262             if (widthCrossingGRF)
263             {
264                 uint16_t stride = 0;
265                 if (srcRegion->isSingleStride(exSize, stride))
266                 {
267                     // replace <v;w,h> with <h;1,0>
268                     src->setRegion(builder.createRegionDesc(stride, 1, 0), true);
269                 }
270                 else
271                 {
272                     doSplit(true);
273                     return true;
274                 }
275             }
276             else if (kernel.getKernelType() == VISA_CM && builder.no64bitRegioning() &&
277                 src->getTypeSize() == 8)
278             {
279                 // for CM, split non-scalar, non-contiguous source that cross GRF as HW conformity
280                 // may be not equipped to deal with them later
281                 const RegionDesc* region = src->getRegion();
282                 if (!region->isScalar() && !region->isContiguous(inst->getExecSize()) &&
283                     src->crossGRF())
284                 {
285                     doSplit(false);
286                     return true;
287                 }
288             }
289         }
290     }
291 
292     return false;
293 }
294 
fixInstExecSize(G4_BB * bb)295 void HWConformity::fixInstExecSize(G4_BB* bb)
296 {
297 #ifdef _DEBUG
298         verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
299 #endif
300 
301     INST_LIST_ITER i = bb->begin();
302     INST_LIST_ITER next_iter = i;
303 
304     for (; i != bb->end(); i = next_iter)
305     {
306         next_iter++;
307         G4_INST *inst = *i;
308         G4_opcode opcode = inst->opcode();
309         if (opcode == G4_nop || opcode == G4_label || inst->mayExceedTwoGRF())
310         {
311             continue;
312         }
313 
314         if (reduceExecSize(i, bb))
315         {
316             next_iter = i;
317             next_iter++;
318         }
319     }
320 }
// Split CISA instructions to follow Gen register region restrictions.
// splitOp is true if inst has to be split into more than one instruction.
reduceExecSize(INST_LIST_ITER iter,G4_BB * bb)323 bool HWConformity::reduceExecSize(INST_LIST_ITER iter, G4_BB* bb)
324 {
325     G4_INST *inst = *iter;
326     // Madw can't be split in any pass except for fixMadwInst as it will cause the dst(SOA layout) unexpected. For example:
327     //    madw (M1, 16) dst(0,0)<1> src0(0,0)<1;1,0> 0x38:ud 0x0:ud
328     // If split here, then low result is in dst(0,0) and dst(2,0), and high result is in dst(1,0) and dst(3,0)
329     //    madw (M1, 8) dst(0,0)<1> src0(0,0)<1;1,0> 0x38:ud 0x0:ud
330     //    madw (M8, 8) dst(2,0)<1> src0(1,0)<1;1,0> 0x38:ud 0x0:ud
331     // But expected dst is low result is in dst(0,0) and dst(1,0) and high result is in dst(2,0) and dst(3,0)
332     if (!inst || inst->isSend() || inst->getExecSize() == 1 || inst->opcode() == G4_madw)
333     {
334         return false;
335     }
336 
337     bool insertMOV = false;
338 
339     G4_DstRegRegion *dst = inst->getDst();
340     uint8_t minExSize = checkMinExecSize(inst->opcode());
341 
342     bool useAcc = (inst->hasACCSrc() ||
343                     (dst && dst->isAccReg()) ||
344                     inst->getImplAccDst());
345 
346     // TODO pre-processing of replicate region, VxH or all indirect sources?
347     bool nullDst = inst->hasNULLDst();
348     bool packedByteDst = false;
349     if (!nullDst && dst)
350     {
351         packedByteDst = IS_BTYPE(dst->getType()) && (dst->getHorzStride() == 1);
352     }
353 
354     unsigned char execSize = inst->getExecSize();
355     bool splitOp = false, goodOneGRFDst = false;
356     bool crossGRFDst = dst && dst->isCrossGRFDst();
357     bool goodTwoGRFDst = false;
358     // for all platforms, if execution size is 8 or less and the destination register is 2, flag updates are not supported.
359     bool specialCondForComprInst = (execSize < 8 && dst && dst->getHorzStride() != 1 &&
360         inst->getCondMod() && inst->opcode() != G4_sel);
361 
362     TARGET_PLATFORM genX = builder.getPlatform();
363 
364     // rules specific to math instructions
365     // INT DIV function does not support SIMD16
366     if (inst->isMath() && inst->asMathInst()->isMathIntDiv() && execSize == 16)
367     {
368         return reduceExecSizeForMath(iter, bb);
369     }
370 
371     if (genX >= GENX_SKL)
372     {
373         // SKL removes rules for GRF alignments, so we don't have to check whether the dst or src is evenly split anymore
374         // This means that any subreg of source can move to any subreg of dst
375         // FIXME: From the comments it seems we still have to handle "specialCondForComprInst"
376         return checkSrcCrossGRF(iter, bb);
377     }
378 
379     // various variables needed for instruction splitting, for some reason
380     G4_opcode op = inst->opcode();
381     G4_Type instExecType = inst->getExecType();
382     bool oneGRFSrc[3] = { false, false, false };
383     bool twoGRFSrc[3] = { false, false, false };
384     bool badTwoGRFSrc[3] = { false, false, false };
385     bool evenTwoGRFSrc[3] = { false, false, false };
386     bool fullTwoGRFSrc[3] = { false, false, false };
387     bool hasBadTwoGRFSrc = false;
388     bool compOpt = false,
389         forceEvenSplit = (execSize == 32 && inst->opcode() == G4_sel && inst->getCondMod()) || packedByteDst;
390     uint8_t numInFirstMov = 0;
391     bool useFlag = inst->getPredicate() || inst->getCondMod() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
392     bool evenSplitDst = false;
393 
394     // separate the checks for BDW to make it more maintainable
395     // For CM use pre-BDW region rules due to HW bugs.
396     if (kernel.getKernelType() != VISA_CM && (genX == GENX_BDW || genX == GENX_CHV))
397     {
398         // for BDW we check the following rules:
399         // Rule 3D
400         // [DevBDW/DevCHV]: When an instruction has a source region spanning two registers and a destination regioning contained in one register, one of the following must be true:
401         // The destination region is entirely contained in the lower Oword of a register.
402         // The destination region is entirely contained in the upper Oword of a register.
403         // The destination elements are evenly split between the two OWords of a register AND evenly split between the two source registers.
404         // Rule 3G
405         // [DevBDW]: When destination spans two registers, the source may be one or two registers. The destination elements must be evenly split between the two registers.
406 
407         bool mayUseIntAcc = op == G4_pseudo_sada2;
408         if (crossGRFDst)
409         {
410             // rule 3G
411             goodTwoGRFDst = inst->goodTwoGRFDst(evenSplitDst) && !specialCondForComprInst && !mayUseIntAcc;
412             splitOp = !goodTwoGRFDst;
413         }
414         else
415         {
416             // rule 3D
417             G4_Operand *srcs[3];
418             uint8_t eleInFirstGRF[3];
419 
420             for (int i = 0; i < inst->getNumSrc(); i++)
421             {
422                 srcs[i] = inst->getSrc(i);
423                 if (srcs[i] && srcs[i]->isSrcRegRegion())
424                 {
425                     bool indirectSrc = srcs[i]->asSrcRegRegion()->getRegAccess() != Direct;
426                     if (!indirectSrc && srcs[i]->asSrcRegRegion()->isScalar())
427                     {
428                         continue;
429                     }
430 
431                     if (inst->opcode() == G4_pln && i == 1)
432                     {
433                         // src1 for plane may touch multiple GRFs as there's a hidden source
434                         continue;
435                     }
436 
437                     if (srcs[i]->crossGRF())
438                     {
439                         twoGRFSrc[i] = true;
440 
441                         if (!nullDst && dst)
442                         {
443                             // check if dst can be entirely contained in one oword
444                             int dstRegionSize = dst->getRightBound() - dst->getLeftBound() + 1;
445 
446                             if (dstRegionSize <= 16)
447                             {
448                                 // see if we can make the dst fit in one oword
449                                 unsigned short dstOffset = 0;
450                                 bool dstOwordAligned = false;
451                                 int dstAlign = Round_Up_Pow2(dstRegionSize);
452                                 dstOwordAligned = builder.isOpndAligned(dst, dstOffset, dstAlign);
453                                 if (!dstOwordAligned)
454                                 {
455                                     // If we can align dst to its size, it must fit in one OWord
456                                     // if we can't, it may still be in OWord (e.g., for size < 16)
457                                     dstOffset %= numEltPerGRF<Type_UB>();
458                                     bool fitInOword = !(dstOffset < 16 && (dstOffset + dstRegionSize) > 16);
459                                     if (!fitInOword)
460                                     {
461                                         // technically if dst and src are both evenly split the instruction is
462                                         // still ok, but this case should be rare so we ignore it
463                                         G4_DstRegRegion* newDst = insertMovAfter(iter, dst, dst->getType(), bb);
464                                         bool alignTmpDst = builder.isOpndAligned(newDst, dstOffset, 16);
465                                         MUST_BE_TRUE(alignTmpDst, "must be able to oword align tmp dst");
466                                         inst->setDest(newDst);
467                                         return true;
468                                     }
469                                 }
470                             }
471                             else
472                             {
473                                 // dst does not fit in one oword, but is guaranteed to be evenly split (it does not cross GRF).
474                                 // check if src is evenly split across the two GRFs
475                                 bool sameSubregOff, vertCrossGRF, contRegion;
476                                 evenTwoGRFSrc[i] = srcs[i]->asSrcRegRegion()->evenlySplitCrossGRF(
477                                     execSize, sameSubregOff, vertCrossGRF, contRegion, eleInFirstGRF[i]);
478                                 bool coverTwoGRF = srcs[i]->asSrcRegRegion()->coverTwoGRF();
479                                 const RegionDesc *rd = srcs[i]->asSrcRegRegion()->getRegion();
480                                 uint16_t stride = 0;
481                                 fullTwoGRFSrc[i] = coverTwoGRF && rd->isSingleStride(inst->getExecSize(), stride) && (stride == 1);
482 
483                                 if (!evenTwoGRFSrc[i])
484                                 {
485                                     // evenly split would be the best approach here, but unfortunately we can't do it
486                                     // if the instruction is predicated
487                                     splitOp = true;
488                                     // compensation OPT
489                                     if (!forceEvenSplit && !hasBadTwoGRFSrc && minExSize == 1 && goodOneGRFDst &&
490                                         contRegion && eleInFirstGRF[i] > (execSize >> 1))
491                                     {
492                                         if (!useFlag && ((!compOpt && numInFirstMov == 0) || numInFirstMov == eleInFirstGRF[i]))
493                                         {
494                                             compOpt = true;
495                                             numInFirstMov = eleInFirstGRF[i];
496                                         }
497                                         else
498                                         {
499                                             compOpt = false;
500                                             hasBadTwoGRFSrc = true;
501                                             badTwoGRFSrc[i] = true;
502                                         }
503                                     }
504                                     else
505                                     {
506                                         hasBadTwoGRFSrc = true;
507                                         badTwoGRFSrc[i] = true;
508                                     }
509                                 }
510                             }
511                         }
512                         // nothing needs to be done when dst is null???
513                     }
514                 }
515             }
516         }
517 
518         // the only reason for split is due to 32-bit flag
519         // split inst into two SIMD 16 instructions
520         if (!splitOp && execSize == 32 &&
521             (inst->getPredicate() || inst->getCondMod()))
522         {
523             if (forceEvenSplit)
524             {
525                 splitSIMD32Inst(iter, bb);
526                 return insertMOV;
527             }
528         }
529     }
530     else
531     {
532         // pre-BDW checks
533 
534         // Check if the instruction will use int ACC later. if yes, compressed instruction
535         // is split into 2 one-GRF instructions.
536 
537         bool mayUseIntAcc = op == G4_pseudo_sada2 ||
538         (op == G4_mul && IS_DTYPE(inst->getSrc(0)->getType()) && IS_DTYPE(inst->getSrc(1)->getType()));
539 
540         if (crossGRFDst)
541         {
542             // rule 3D
543             goodTwoGRFDst = inst->goodTwoGRFDst(evenSplitDst) && !specialCondForComprInst && !mayUseIntAcc;
544             splitOp = !goodTwoGRFDst;
545         }
546 
547         G4_Operand *srcs[3];
548         uint8_t eleInFirstGRF[3];
549         for (int i = 0; i < inst->getNumSrc(); i++)
550         {
551             srcs[i] = inst->getSrc(i);
552 
553             if (srcs[i] && srcs[i]->isSrcRegRegion() &&
554                 !(inst->opcode() == G4_math && i == 1 && srcs[i]->isNullReg()))
555             {
556                 bool indirectSrc = (srcs[i]->isSrcRegRegion() &&
557                     srcs[i]->asSrcRegRegion()->getRegAccess() != Direct);
558 
559                 if (!indirectSrc && srcs[i]->asSrcRegRegion()->isScalar())
560                 {
561                     continue;
562                 }
563                 if (inst->opcode() == G4_pln && i == 1)
564                 {
565                     continue;
566                 }
567 
568                 if (crossGRFDst && indirectSrc)
569                 {
570                     // Assumption: all indirect operand follow GenX requirement (no cross-GRF indexing ...)
571                     // pre_BDW rule 6D: When a Vx1 or a VxH addressing mode is used on src0,
572                     // the destination must use ONLY one register.
573                     // Vx1 is not handled now. only vxh is considered here
574                     // if (srcs[i]->asSrcRegRegion()->getRegion()->isRegionWH())
575                     {
576                         splitOp = true;
577                     }
578                 }
579                 else if (srcs[i]->crossGRF())
580                 {
581                     twoGRFSrc[i] = true;
582                     bool sameSubregOff, vertCrossGRF, contRegion;
583                     evenTwoGRFSrc[i] = srcs[i]->asSrcRegRegion()->evenlySplitCrossGRF(
584                         execSize, sameSubregOff, vertCrossGRF, contRegion, eleInFirstGRF[i]);
585                     bool coverTwoGRF = srcs[i]->asSrcRegRegion()->coverTwoGRF();
586                     const RegionDesc *rd = srcs[i]->asSrcRegRegion()->getRegion();
587                     uint16_t stride = 0;
588                     fullTwoGRFSrc[i] = coverTwoGRF && rd->isSingleStride(inst->getExecSize(), stride) && (stride == 1);
589 
590                     if (dst && !crossGRFDst)
591                     {
592                         // destination requirements are:
593                         // -- The destination region is entirely contained in the lower OWord of a register.
594                         // -- The destination region is entirely contained in the upper OWord of a register.
595                         // -- The destination elements are evenly split between the two OWords of a register.
596                         int dstRegionSize = dst->getRightBound() - dst->getLeftBound() + 1;
597 
598                         // round up dst region size to next power of two
599                         int dstAlign = Round_Up_Pow2(dstRegionSize);
600 
601                         bool dstOwordAligned = false;
602                         dstOwordAligned = builder.isOpndAligned(dst, dstAlign);
603                         if (dstOwordAligned)
604                         {
605                             // If we can align dst to its size, it must fit in one OWord
606                             goodOneGRFDst = true;
607                         }
608                         else
609                         {
610                             // if we can't, it may still be in one OWord or evenly split
611                             goodOneGRFDst = dst->goodOneGRFDst(execSize);
612                         }
613                     }
614 
615                     // region can be fixed later in fixCompressedInst().
616                     // rule 3E and 3F
617                     // 2-GRF src should follow below implicit rules, no matter the dst size:
618                     // pre-BDW
619                     // 1. Data must be evenly split between source registers.
620                     // 2. Same subregister number in the two GRFs(occupy whole two GRFs) if dst is two-GRF.
621                     if (!evenTwoGRFSrc[i] ||
622                         (((goodTwoGRFDst || (goodOneGRFDst && !contRegion)) && !sameSubregOff) ||
623                         (goodTwoGRFDst && IS_WTYPE(srcs[i]->getType()) && !(srcs[i]->asSrcRegRegion()->checkGRFAlign() && coverTwoGRF))))
624                     {
625                         splitOp = true;
626                         // compensation OPT
627                         if (!forceEvenSplit && !hasBadTwoGRFSrc && minExSize == 1 && goodOneGRFDst &&
628                             contRegion && eleInFirstGRF[i] > (execSize >> 1))
629                         {
630                             if (!useFlag && ((!compOpt && numInFirstMov == 0) || numInFirstMov == eleInFirstGRF[i]))
631                             {
632                                 compOpt = true;
633                                 numInFirstMov = eleInFirstGRF[i];
634                             }
635                             else
636                             {
637                                 compOpt = false;
638                                 hasBadTwoGRFSrc = true;
639                                 badTwoGRFSrc[i] = true;
640                             }
641                         }
642                         else
643                         {
644                             hasBadTwoGRFSrc = true;
645                             badTwoGRFSrc[i] = true;
646                         }
647                     }
648                     // rule 3C and 3D
649                     // mul (4) r8.3<1>:f r2.3<4;4,1>:f r31.0<8;2,4>:f {Align1}
650                     else if (dst && !crossGRFDst && !goodOneGRFDst)
651                     {
652                         splitOp = true;
653                     }
654                 }
655             }
656         }
657 
658         // the only reason for split is due to 32-bit flag
659         // split inst into two SIMD 16 instructions
660         if (!splitOp && execSize == 32 &&
661              (packedByteDst || (inst->getPredicate() || inst->getCondMod())))
662         {
663             if (forceEvenSplit)
664             {
665                 // FIXME: try to use evenlySplitInst() instead.
666                 splitSIMD32Inst(iter, bb);
667                 return insertMOV;
668             }
669         }
670 
671         // You will need to do this ONLY when destination spans 2 registers, src1 is a word or byte and you expect channels to be turned off !!
672         // currrently for instruction with pred or emask on pre-BDW
673         bool specialCondForShootDown = (dst && goodTwoGRFDst &&
674             (inst->getPredicate() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst())) &&
675             oneGRFSrc[1] && (IS_BTYPE(srcs[1]->getType()) || IS_WTYPE(srcs[1]->getType())));
676         if (specialCondForShootDown)
677         {
678             splitOp = true;
679         }
680     }
681 
682     if (!splitOp)
683     {
684         return insertMOV;
685     }
686 
687     MUST_BE_TRUE((inst->opcode() != G4_smov), "Error in splitting smov instruction");
688 
689     // split instruction like:
690     // mad (8) V24(2,0)<2> V20(2,0)<16;8,2> V20(2,0)<16;8,2> V23(2,0)<16;8,2>
691     if (splitOp && crossGRFDst && evenSplitDst && !hasBadTwoGRFSrc && (execSize >= 16 || !useFlag))
692     {
693         if (minExSize == 1 || execSize > minExSize)
694         {
695             evenlySplitInst(iter, bb);
696             return insertMOV;
697         }
698     }
699 
700     // For inst with pred, condMod, or with mask in SIMDCF BB, we insert MOVs with nomask for src/dst
701     // to avoid instruction spliting. inserted MOVs may be split into multiple instructions.
702     // ATTN: We do not include sel here because the condMod generated by sel is never used.
703     if (useFlag &&
704         !(inst->opcode() == G4_sel && !(inst->getPredicate()) && inst->getCondMod()))
705     {
706         // if there is predicate or cond modifier, we keep the original instruction and
707         // perform spliting on new MOV instructions.
708         if (!nullDst && !crossGRFDst && !goodOneGRFDst)
709         {
710             // try to move 2-GRF src into 1GRF tmp to avoid spliting.
711             // this is unnecessary in non-SIMDCF/nonPred/nonCondMod cases because we can do compensation.
712             for (int i = 0; i < inst->getNumSrc(); i++)
713             {
714                 if (twoGRFSrc[i] && !fullTwoGRFSrc[i])
715                 {
716                     moveSrcToGRF(iter, i, 1, bb);
717                     twoGRFSrc[i] = false;
718                     badTwoGRFSrc[i] = false;
719                     INST_LIST_ITER tmpIter = iter;
720                     tmpIter--;
721                     if (builder.getOption(vISA_OptReport))
722                     {
723                         (*tmpIter)->emit(std::cout);
724                         std::cout << std::endl;
725                     }
726                     reduceExecSize(tmpIter, bb);
727                 }
728             }
729             if (!fullTwoGRFSrc[0] && !fullTwoGRFSrc[1] && !fullTwoGRFSrc[2])
730             {
731                 return insertMOV;
732             }
733         }
734         //FIXME: another option is that if original exec size is 16 and will be split into
735         // two simd8, we can use quarter control in some cases.
736 
737         if (!nullDst &&
738             ((!crossGRFDst && !goodOneGRFDst) ||
739                (crossGRFDst && !goodTwoGRFDst) ||
740             (goodTwoGRFDst && specialCondForComprInst)))
741         {
742             // TODO: NULL dst
743             // use temp dst.
744             // case 1: SIMD CF
745             // mov (16) r3.5<1>:b r1.0<8;8,1>:d
746             // ==>
747             // mov (16) r6.0<2>:b r3.5<16;16,1>:b {nomask}
748             // mov (16) r6.0<2>:b r1.0<8;8,1>:d  -- other dst alignment fix will take care of dst.
749             // mov (16) r3.5<1>:b r6.0<32;16,2>:b {nomask}
750 
751             // case 2, no SIMDCF
752             // (f0.0) mov (16) r3.5<1>:b r1.0<8;8,1>:d
753             // ==>
754             // mov (16) r6.0<2>:b r1.0<8;8,1>:d  -- other dst alignment fix will take care of dst.
755             // (f0.0) mov (16) r3.5<1>:b r6.0<32;16,2>:b
756 
757             uint8_t scale = TypeSize(instExecType) / dst->getTypeSize();
758 
759             if (scale > 1 &&
760                 TypeSize(instExecType) * (unsigned)execSize > numEltPerGRF<Type_UB>())
761             {
762                 scale = numEltPerGRF<Type_UB>() / dst->getTypeSize() / execSize;
763             }
764             else if (scale == 0)
765             {
766                 scale = 1;
767             }
768 
769             // can't split if inst is in SIMD flow and is not NoMask, or the inst has predicate
770             // Have to introduce a temp that supports splitting instead
771             if ((!bb->isAllLaneActive() && !inst->isWriteEnableInst()) || inst->getPredicate())
772             {
773                 saveDst(iter, scale, bb);
774                 INST_LIST_ITER tmpIter = iter;
775                 tmpIter--;
776                 if (builder.getOption(vISA_OptReport))
777                 {
778                     (*tmpIter)->emit(std::cout);
779                     std::cout << std::endl;
780                     inst->emit(std::cout);
781                     std::cout << std::endl;
782                 }
783                 // Fix up the move to load dst.  We can split the move instruction as it is NoMask
784                 reduceExecSize(tmpIter, bb);
785 
786                 // source may also be bad, so we have to call reduceExecSize() on iter again
787                 reduceExecSize(iter, bb);
788 
789                 // generate MOV after inst
790                 // if the dst is bad, it will be fixed by the next call to reduceExecSize()
791                 restoreDst(iter, dst, bb);
792 
793                 if (builder.getOption(vISA_OptReport))
794                 {
795                     tmpIter = iter;
796                     tmpIter++;
797                     (*tmpIter)->emit(std::cout);
798                     std::cout << std::endl;
799                 }
800             }
801             else
802             {
803                 insertMovAfter(iter, scale, bb);
804                 if (builder.getOption(vISA_OptReport))
805                 {
806                     INST_LIST_ITER tmpIter = iter;
807                     tmpIter++;
808                     (*tmpIter)->emit(std::cout);
809                     std::cout << std::endl;
810                 }
811             }
812             goodOneGRFDst = true;
813             goodTwoGRFDst = true;
814             crossGRFDst = false;
815             insertMOV = true;
816         }
817 
818         removeBadSrc(iter, bb, crossGRFDst, oneGRFSrc, badTwoGRFSrc);
819         return insertMOV;
820     }
821 
822     if (!nullDst && !crossGRFDst && !goodOneGRFDst && !hasBadTwoGRFSrc)
823     {
824         // insert a temp dst and a MOV
825         // example:
826         // add (8) r5.3<1>:w r2.0<16;8,2>:w 0x1:w
827         // ==>
828         // add (8) r6.0<1>:w r2.0<8;8,1>:d 0x1:w
829         // mov (8) r5.3<1>:b r6.0<8;8,1>:w
830         // In some cases spliting the instruction generates the same number of instruction
831         // without dependency, but needs more analysis.
832         inst->setDest(insertMovAfter(iter, dst, dst->getType(), bb));
833         if (builder.getOption(vISA_OptReport))
834         {
835             inst->emit(std::cout);
836             std::cout << std::endl;
837             INST_LIST_ITER tmpIter = iter;
838             tmpIter++;
839             (*tmpIter)->emit(std::cout);
840             std::cout << std::endl;
841         }
842         return true;
843     }
844 
845     // only two kinds of instruction use ACC operands:
846     // 1. instructions generated in ARCTAN intrinsic translation.
847     // they do not need spliting
848     // 2. instructions generated in MAC opt. there is a check to make
849     // sure only evenly spliting will happen to them.
850     if (useAcc)
851     {
852         evenlySplitInst(iter, bb);
853         return insertMOV;
854     }
855     // split the instruction into a list of instructions
856     splitInstruction(iter, bb, compOpt, numInFirstMov, false, true);
857     return true;
858 }
859 
860 // split a SIMD32 inst into two SIMD16.
861 // there is predicate/conditional modifier used in this inst.
862 //
863 // Result:
864 //    Inst refered to by 'iter' is split into two simd16 instrcutions.
865 //    One is inserted right before 'iter', the other is to reuse 'iter'.
866 // And the caller of this function can access two new instructions via '--iter' and 'iter' !
splitSIMD32Inst(INST_LIST_ITER iter,G4_BB * bb)867 void HWConformity::splitSIMD32Inst(INST_LIST_ITER iter, G4_BB* bb)
868 {
869     G4_INST *inst = *iter;
870     G4_opcode op = inst->opcode();
871     G4_Operand *srcs[3] = { nullptr };
872     int numSrc = inst->getNumSrc();
873 
874     // check dst/src dependency
875     checkSrcDstOverlap(iter, bb, false);
876     for (int i = 0; i < numSrc; i++)
877     {
878         srcs[i] = inst->getSrc(i);
879     }
880 
881     // compute max exeuction size.
882     // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
883     // if half-GRF boundary is crossed.
884     G4_DstRegRegion *dst = inst->getDst();
885     bool nullDst = dst && inst->hasNULLDst();
886     G4_ExecSize instExSize = inst->getExecSize(), currExSize = G4_ExecSize(instExSize / 2);
887     for (int i = 0; i < instExSize; i += currExSize)
888     {
889         // create new Oprands. Acc should not be split since we generate it in jitter and
890         // can control this.
891         // create new condMod and predicate
892         G4_CondMod *newCondMod = inst->getCondMod();
893         if (newCondMod)
894         {
895             newCondMod = builder.createCondMod(
896                 newCondMod->getMod(), newCondMod->getBase(), i == 0 ? 0 : 1);
897         }
898 
899         G4_Predicate *newPredOpnd = inst->getPredicate();
900         if (newPredOpnd)
901         {
902             newPredOpnd = builder.createPredicate(
903                 newPredOpnd->getState(), newPredOpnd->getBase(), i == 0 ? 0 : 1, newPredOpnd->getControl());
904         }
905 
906         G4_DstRegRegion *newDst;
907         if (!nullDst)
908         {
909             newDst = builder.createSubDstOperand(dst, (uint16_t) i, currExSize);
910         }
911         else
912         {
913             newDst = dst;
914         }
915         // generate new inst
916         G4_INST* newInst;
917         if ((i + currExSize) < instExSize)
918         {
919             newInst = builder.makeSplittingInst(inst, currExSize);
920             newInst->setDest(newDst);
921             newInst->setPredicate(newPredOpnd);
922             newInst->setCondMod(newCondMod);
923             bb->insertBefore(iter, newInst);
924         }
925         else
926         {
927             // reuse the original inst
928             newInst = inst;
929             newInst->setExecSize(currExSize);
930             newInst->setDest(newDst);
931             newInst->setPredicate(newPredOpnd);
932             newInst->setCondMod(newCondMod);
933         }
934 
935         for (int j = 0; j < numSrc; j++)
936         {
937             if (srcs[j])
938             {
939                 // src1 for single source math should be arc reg null.
940                 if (srcs[j]->isImm() ||
941                     (inst->opcode() == G4_math && j == 1 && srcs[j]->isNullReg()))
942                 {
943                     newInst->setSrc(srcs[j], j);
944                 }
945                 else if (srcs[j]->asSrcRegRegion()->isScalar() || (j == 0 && op == G4_line))
946                 {
947                     newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
948                 }
949                 else
950                 {
951                     newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), (uint16_t)i,
952                         currExSize, (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->vertStride),
953                         (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->width)), j);
954                 }
955             }
956         }
957 
958         // maintain def-use chain
959         if (newInst == inst)
960         {
961             newInst->trimDefInstList();
962         }
963         else
964         {
965             // Defs (uses) of this new instruction will be a subset of the
966             // original instruction's defs (uses).
967             inst->copyDefsTo(newInst, true);
968             inst->copyUsesTo(newInst, true);
969         }
970     }
971 }
972 
splitInstruction(INST_LIST_ITER iter,G4_BB * bb,bool compOpt,uint8_t numInFirstMov,bool rule4_11,bool canSrcCrossGRF)973 void HWConformity::splitInstruction(INST_LIST_ITER iter, G4_BB* bb, bool compOpt, uint8_t numInFirstMov,
974     bool rule4_11, bool canSrcCrossGRF)
975 {
976     G4_INST *inst = *iter;
977     G4_opcode op = inst->opcode();
978     G4_Operand *srcs[3] = { nullptr };
979 
980     // check dst/src dependency
981     checkSrcDstOverlap(iter, bb, compOpt);
982 
983     int numSrcs = inst->getNumSrc();
984 
985     for (int i = 0; i < numSrcs; i++)
986     {
987         srcs[i] = inst->getSrc(i);
988     }
989 
990     uint8_t minExSize = checkMinExecSize(op);
991     // compute max exeuction size.
992     // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
993     // if half-GRF boundary is crossed.
994     G4_DstRegRegion *dst = inst->getDst();
995     bool nullDst = inst->hasNULLDst();
996     G4_ExecSize instExSize = inst->getExecSize();
997     G4_ExecSize currExSize;
998     uint16_t vs[3] = { 0 }, wd[3] = { 0 };
999 
1000     G4_Predicate *instPred = inst->getPredicate();
1001 
1002     // first, produce mask if needed
1003     // mov (16) r2.0<1>:uw 0:uw {Align1, NoMask}   // 0:uw
1004     // mov (16) r2.0<1>:uw 0x1:uw {Align1}   // 1:uw
1005     // this part is currently not used since we do not split inst with predicate or emask
1006     bool isSIMDCFInst = !bb->isAllLaneActive() && !inst->isWriteEnableInst();
1007     G4_Declare *maskDcl = NULL;
1008     if (instPred || isSIMDCFInst)
1009     {
1010         maskDcl = builder.createTempVar(instExSize, Type_UW, Eight_Word);
1011         G4_DstRegRegion * tmpMaskOpnd = builder.createDst(maskDcl->getRegVar(), 0, 0, 1, Type_UW);
1012 
1013         G4_INST* firstMov = builder.createMov(instExSize,
1014             tmpMaskOpnd, builder.createImm(0, Type_UW), inst->getOption(), false);
1015 
1016         G4_Predicate* pred = builder.duplicateOperand(inst->getPredicate());
1017         auto movInst = builder.createMov(instExSize, tmpMaskOpnd, builder.createImm(1, Type_UW), inst->getOption(), false);
1018         movInst->setPredicate(pred);
1019 
1020         if (isSIMDCFInst)
1021         {
1022             firstMov->setNoMask(true);
1023         }
1024     }
1025 
1026     bool needsMaskOffset = instPred || isSIMDCFInst || inst->getCondMod() != nullptr;
1027 
1028     for (uint8_t i = 0; i < instExSize; i += currExSize)
1029     {
1030         if (compOpt && i == 0)
1031         {
1032             currExSize = G4_ExecSize(numInFirstMov);
1033             G4_INST *newInst = builder.makeSplittingInst(inst, instExSize);
1034             newInst->setDest(builder.duplicateOperand(inst->getDst()));
1035             newInst->setPredicate(builder.duplicateOperand(inst->getPredicate()));
1036             newInst->setCondMod(builder.duplicateOperand(inst->getCondMod()));
1037             for (int j = 0; j < inst->getNumSrc(); j++)
1038             {
1039                 newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1040             }
1041             // update def-use chain
1042             inst->copyDefsTo(newInst, true);
1043             inst->copyDefsTo(newInst, true);
1044             bb->insertBefore(iter, newInst);
1045             continue;
1046         }
1047 
1048         // this stores the max allowed exec size for each operand (0 -- dst, 1 -- src0, and so on)
1049         uint8_t opndExSize[4] = { 0, 0, 0, 0 };
1050         currExSize = G4_ExecSize(roundDownPow2(instExSize - i));
1051 
1052         bool crossGRFsrc = false;
1053         for (int j = 0; j < numSrcs; j++)
1054         {
1055             if (!srcs[j] || !srcs[j]->isSrcRegRegion() ||
1056                 srcs[j]->isNullReg() || (j == 0 && op == G4_line))
1057             {
1058                 opndExSize[j + 1] = currExSize;
1059                 continue;
1060             }
1061             bool twoGRFsrc = false;
1062             opndExSize[j+1] = srcs[j]->asSrcRegRegion()->getMaxExecSize(i, currExSize, canSrcCrossGRF, vs[j], wd[j], twoGRFsrc);
1063 
1064             if (opndExSize[j + 1] > 8 && rule4_11)
1065             {
1066                 opndExSize[j + 1] = 8;
1067             }
1068 
1069             crossGRFsrc |= twoGRFsrc;
1070             if (minExSize == 1)
1071             {
1072                 currExSize = G4_ExecSize(opndExSize[j + 1]);
1073             }
1074         }
1075 
1076         if (dst && !nullDst)
1077         {
1078             opndExSize[0] = dst->getMaxExecSize(i, currExSize, crossGRFsrc);
1079 
1080             if (opndExSize[0] > 8 && rule4_11)
1081                 opndExSize[0] = 8;
1082         }
1083         else
1084         {
1085             // dst essentially does not affect the splitting decision
1086             opndExSize[0] = currExSize;
1087         }
1088 
1089         if (minExSize == 1)
1090         {
1091             currExSize = G4_ExecSize(opndExSize[0]);
1092         }
1093 
1094         bool needMov = false;
1095         if (minExSize > 1)
1096         {
1097             // find minimal execsize. if it is not less than minExSize, use it
1098             // to avoid dependency
1099             // FIXME: optimize this part by avoiding MOVs
1100             uint8_t currMinExSize = 64;
1101             currExSize = G4_ExecSize(0);
1102             for (int j = 0; j <= numSrcs; j++)
1103             {
1104                 // use max possible exsize
1105                 if (opndExSize[j] > currExSize)
1106                 {
1107                     currExSize = G4_ExecSize(opndExSize[j]);
1108                 }
1109                 if (opndExSize[j] != 0 && opndExSize[j] < currMinExSize)
1110                 {
1111                     currMinExSize = opndExSize[j];
1112                 }
1113             }
1114 
1115             if (currMinExSize >= minExSize)
1116             {
1117                 currExSize = G4_ExecSize(currMinExSize);
1118             }
1119             else
1120             {
1121                 for (int j = 0; j <= numSrcs; j++)
1122                 {
1123                     if (opndExSize[j] != 0 && opndExSize[j] < currExSize)
1124                     {
1125                         needMov = true;
1126                     }
1127                 }
1128             }
1129         }
1130 
1131         MUST_BE_TRUE(currExSize != 0, "illegal execution size in instruction splitting");
1132         // create new Oprands. Acc should not be split since we generate it in jitter and
1133         // can control this.
1134         G4_DstRegRegion *newDst = !nullDst ? builder.createSubDstOperand(dst, (uint16_t)i, currExSize) : dst;
1135 
1136         // generate new inst
1137         G4_INST* newInst;
1138         INST_LIST_ITER newInstIter;
1139         if ((i + currExSize) < instExSize)
1140         {
1141             newInst = builder.makeSplittingInst(inst, currExSize);
1142             newInst->setDest(newDst);
1143             newInst->setPredicate(builder.duplicateOperand(inst->getPredicate()));
1144             newInst->setCondMod(builder.duplicateOperand(inst->getCondMod()));
1145             bb->insertBefore(iter, newInst);
1146             newInstIter = iter;
1147             newInstIter--;
1148         }
1149         else
1150         {
1151             // reuse the original inst
1152             newInst = inst;
1153             newInst->setDest(newDst);
1154             newInst->setExecSize(currExSize);
1155             newInstIter = iter;
1156         }
1157 
1158         for (int j = 0; j < inst->getNumSrc(); j++)
1159         {
1160             if (srcs[j])
1161             {
1162                 // src1 for single source math should be arc reg null.
1163                 if (srcs[j]->isImm() ||
1164                     (inst->opcode() == G4_math && j == 1 && srcs[j]->isNullReg()))
1165                 {
1166                     newInst->setSrc(srcs[j], j);
1167                 }
1168                 else if (srcs[j]->asSrcRegRegion()->isScalar() || (j == 0 && op == G4_line))
1169                 {
1170                     newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1171                 }
1172                 else
1173                 {
1174                     if (srcs[j]->isAddrExp())
1175                     {
1176                         G4_AddrExp* addExp = builder.createAddrExp(srcs[j]->asAddrExp()->getRegVar(), srcs[j]->asAddrExp()->getOffset(), srcs[j]->asAddrExp()->getType());
1177                         newInst->setSrc(addExp, j);
1178                     }
1179                     else
1180                     {
1181                         uint16_t start = i;
1182                         newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), start, currExSize, vs[j], wd[j]), j);
1183                     }
1184                 }
1185             }
1186         }
1187 
1188         if (instExSize == 16 &&
1189             currExSize == 8 &&
1190             needsMaskOffset)
1191         {
1192             if (instPred)
1193             {
1194                 G4_Predicate* tPred = builder.duplicateOperand(instPred);
1195                 tPred->setInst(newInst);
1196                 newInst->setPredicate(tPred);
1197             }
1198 
1199             if (newInst->getMaskOffset() == 0)
1200             {
1201                 newInst->setMaskOption(i == 0 ? InstOpt_M0 : InstOpt_M8);
1202             }
1203             else
1204             {
1205                 newInst->setMaskOption(i == 0 ? InstOpt_M16 : InstOpt_M24);
1206             }
1207         }
1208 
1209         // maintain def-use chain
1210         if (newInst == inst)
1211         {
1212             newInst->trimDefInstList();
1213         }
1214         else
1215         {
1216             inst->copyDefsTo(newInst, /*checked*/true);
1217             inst->copyUsesTo(newInst, /*checked*/true);
1218         }
1219 
1220         // the following code is to keep minimal execution size for some opcode, for example, DP4
1221         // insert mov if needed
1222         if (needMov)
1223         {
1224             for (int j = 0; j < inst->getNumSrc(); j++)
1225             {
1226                 if (opndExSize[j + 1] < currExSize)
1227                 {
1228                     newInst->setSrc(insertMovBefore(newInstIter, j, srcs[j]->getType(), bb), j);
1229                     // reducing exec size for new MOV
1230                     INST_LIST_ITER newMovIter = newInstIter;
1231                     newMovIter--;
1232                     reduceExecSize(newMovIter, bb);
1233                     if (builder.getOption(vISA_OptReport))
1234                     {
1235                         (*newMovIter)->emit(std::cout);
1236                         std::cout << std::endl;
1237                     }
1238                 }
1239             }
1240         }
1241         if (builder.getOption(vISA_OptReport))
1242         {
1243             newInst->emit(std::cout);
1244             std::cout << std::endl;
1245         }
1246         // dst
1247         if (needMov && opndExSize[0] < currExSize)
1248         {
1249             (*newInstIter)->setDest(
1250                 insertMovAfter(newInstIter, inst->getDst(), inst->getDst()->getType(), bb));
1251             INST_LIST_ITER newMovIter = newInstIter;
1252             newMovIter++;
1253             reduceExecSize(newMovIter, bb);
1254             if (builder.getOption(vISA_OptReport))
1255             {
1256                 (*newMovIter)->emit(std::cout);
1257                 std::cout << std::endl;
1258             }
1259         }
1260     }
1261 }
1262 
1263 
1264 
1265 // evenly split an inst into two instructions with half execution size.
1266 // this is used to split a simd16 math into two simd8 before other reducing exeuction size actions
1267 //
1268 // This will has two instructions: one is right before "iter", the other is to re-use "iter". The
1269 // caller is safe to use "--iter" and "iter" to refer those two instructions.
evenlySplitInst(INST_LIST_ITER iter,G4_BB * bb,bool checkOverlap)1270 bool HWConformity::evenlySplitInst(INST_LIST_ITER iter, G4_BB* bb, bool checkOverlap)
1271 {
1272     G4_INST* inst = *iter;
1273     G4_opcode op = inst->opcode();
1274     G4_Operand* srcs[3];
1275     int origMaskOffset = inst->getMaskOffset();
1276     bool extraMov = false;
1277     const int numSrc = inst->getNumSrc();
1278 
1279     // check dst/src dependency
1280     if (checkOverlap)
1281     {
1282         extraMov = checkSrcDstOverlap(iter, bb, false);
1283     }
1284 
1285     bool useARF = false;
1286     for (int i = 0; i < numSrc; i++)
1287     {
1288         srcs[i] = inst->getSrc(i);
1289     }
1290 
1291     // compute max exeuction size.
1292     // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
1293     // if half-GRF boundary is crossed.
1294 
1295     G4_DstRegRegion* dst = inst->getDst();
1296     bool nullDst = dst && inst->hasNULLDst();
1297     G4_ExecSize instExSize = inst->getExecSize(), currExSize = G4_ExecSize(instExSize / 2);
1298 
1299     G4_Predicate* newPred = NULL;
1300     if (inst->getPredicate())
1301     {
1302         newPred = inst->getPredicate();
1303         newPred->splitPred();
1304     }
1305 
1306     G4_CondMod* newCond = NULL;
1307     if (inst->getCondMod())
1308     {
1309         newCond = inst->getCondMod();
1310         newCond->splitCondMod();
1311     }
1312 
1313     G4_SrcRegRegion* accSrcRegion = NULL;
1314     if (inst->getImplAccSrc())
1315     {
1316         accSrcRegion = inst->getImplAccSrc()->asSrcRegRegion();
1317     }
1318 
1319     G4_DstRegRegion* accDstRegion = NULL;
1320     if (inst->getImplAccDst())
1321     {
1322         accDstRegion = inst->getImplAccDst();
1323     }
1324 
1325     if (accSrcRegion || accDstRegion || newPred || newCond)
1326     {
1327         useARF = true;
1328     }
1329 
1330     for (int i = 0; i < instExSize; i += currExSize)
1331     {
1332         // create new Oprands.
1333         G4_DstRegRegion* newDst;
1334         if (!nullDst)
1335         {
1336             newDst = builder.createSubDstOperand(dst, (uint16_t)i, currExSize);
1337         }
1338         else
1339         {
1340             newDst = dst;
1341         }
1342         // generate new inst
1343         G4_INST* newInst;
1344         if ((i + currExSize) < instExSize)
1345         {
1346             newInst = builder.makeSplittingInst(inst, currExSize);
1347             newInst->setImplAccDst(builder.duplicateOperand(accDstRegion));
1348             newInst->setImplAccSrc(builder.duplicateOperand(accSrcRegion));
1349             newInst->setDest(newDst);
1350             newInst->setPredicate(builder.duplicateOperand(newPred));
1351             newInst->setCondMod(builder.duplicateOperand(newCond));
1352             newInst->setEvenlySplitInst(true);
1353             bb->insertBefore(iter, newInst);
1354         }
1355         else
1356         {
1357             // reuse the original inst
1358             newInst = inst;
1359             newInst->setExecSize(currExSize);
1360             newInst->setDest(newDst);
1361             if (newPred)
1362             {
1363                 inst->setPredicate(builder.duplicateOperand(newPred));
1364             }
1365             if (newCond)
1366             {
1367                 inst->setCondMod(builder.duplicateOperand(newCond));
1368             }
1369             if (accSrcRegion)
1370             {
1371                 newInst->setImplAccSrc(builder.createSrcRegRegion(*accSrcRegion));
1372             }
1373             if (accDstRegion)
1374             {
1375                 newInst->setImplAccDst(builder.createDstRegRegion(*accDstRegion));
1376             }
1377         }
1378 
1379         for (int j = 0; j < numSrc; j++)
1380         {
1381             if (srcs[j])
1382             {
1383                 if (srcs[j]->isImm() || srcs[j]->isNullReg())
1384                 {
1385                     newInst->setSrc(srcs[j], j);
1386                 }
1387                 else if (srcs[j]->isScalarSrc() || (j == 0 && op == G4_line))
1388                 {
1389                     // no need to split, but need to duplicate
1390                     newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1391                 }
1392                 else
1393                 {
1394                     newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), (uint16_t)i,
1395                         currExSize, (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->vertStride),
1396                         (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->width)), j);
1397                 }
1398             }
1399         }
1400 
1401         // set mask
1402         bool needsMaskOffset = useARF || (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
1403         if (needsMaskOffset)
1404         {
1405             int newMaskOffset = origMaskOffset + (i == 0 ? 0 : currExSize);
1406             bool nibOk = builder.hasNibCtrl() &&
1407                 (inst->getDst()->getTypeSize() == 8 || TypeSize(inst->getExecType()) == 8);
1408             G4_InstOption newMask = G4_INST::offsetToMask(currExSize, newMaskOffset, nibOk);
1409             if (newMask == InstOpt_NoOpt)
1410             {
1411                 bool useMask = inst->getPredicate() || inst->getCondModBase() ||
1412                     (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
1413                 MUST_BE_TRUE(!useMask, "no legal emask found for the split instruction");
1414             }
1415             else
1416             {
1417                 newInst->setMaskOption(newMask);
1418             }
1419         }
1420 
1421         // maintain def-use chain
1422         if (newInst == inst)
1423         {
1424             newInst->trimDefInstList();
1425         }
1426         else
1427         {
1428             inst->copyDefsTo(newInst, /*checked*/true);
1429             inst->copyUsesTo(newInst, /*checked*/true);
1430         }
1431         if (builder.getOption(vISA_OptReport))
1432         {
1433             newInst->emit(std::cout);
1434             std::cout << std::endl;
1435         }
1436     }
1437 
1438     return extraMov;
1439 }
1440 
1441 // this is specifically for math instruction
1442 // assumption: the input math function is a compressed instruction and need split
reduceExecSizeForMath(INST_LIST_ITER iter,G4_BB * bb)1443 bool HWConformity::reduceExecSizeForMath(INST_LIST_ITER iter, G4_BB* bb)
1444 {
1445     // split the instruction into two first
1446     evenlySplitInst(iter, bb);
1447     // fix execution size for each one
1448     INST_LIST_ITER firstIter = iter;
1449     firstIter--;
1450     reduceExecSize(firstIter, bb);
1451     return reduceExecSize(iter, bb);
1452 }
1453 // check overlap between src and dst
1454 // if overlap exists, insert to MOV to eliminate it
1455 // how about replicate regions?<0;4,1>
checkSrcDstOverlap(INST_LIST_ITER iter,G4_BB * bb,bool compOpt)1456 bool HWConformity::checkSrcDstOverlap(INST_LIST_ITER iter, G4_BB* bb, bool compOpt)
1457 {
1458     G4_INST *inst = *iter;
1459     G4_Operand *srcs[3];
1460     bool hasOverlap = false;
1461 
1462     for (int i = 0; i < inst->getNumSrc(); i++)
1463     {
1464         srcs[i] = inst->getSrc(i);
1465     }
1466     // check dst/src dependency
1467     // how about replicate regions?<0;4,1>
1468     if (inst->getDst() && !inst->hasNULLDst())
1469     {
1470         for (int i = 0; i < inst->getNumSrc(); i++)
1471         {
1472             bool useTmp = false;
1473             if (srcs[i] && (IS_VINTTYPE(srcs[i]->getType()) || IS_VFTYPE(srcs[i]->getType())))
1474             {
1475                 useTmp = true;
1476             }
1477             else
1478             {
1479                 G4_CmpRelation rel = inst->getDst()->compareOperand(srcs[i]);
1480                 if (rel != Rel_disjoint)
1481                 {
1482                     useTmp = (rel != Rel_eq) || compOpt ||
1483                         srcs[i]->asSrcRegRegion()->getRegion()
1484                         ->isRepeatRegion(inst->getExecSize());
1485                 }
1486             }
1487             if (useTmp)
1488             {
1489                 // insert mov
1490                 inst->setSrc(insertMovBefore(iter, i, G4_Operand::GetNonVectorImmType(srcs[i]->getType()), bb), i);
1491                 srcs[i] = inst->getSrc(i);
1492                 // reducing exec size for new MOV
1493                 INST_LIST_ITER newMovIter = iter;
1494                 newMovIter--;
1495                 reduceExecSize(newMovIter, bb);
1496                 hasOverlap = true;
1497 
1498             }
1499         }
1500     }
1501 
1502     return hasOverlap;
1503 }
1504 
// Move a source operand into a temporary that spans one or two GRFs.
// The temporary dst uses the same type as the source.
// This MOV does not need any further execution-size reduction.
moveSrcToGRF(INST_LIST_ITER it,uint32_t srcNum,uint16_t numGRF,G4_BB * bb)1508 void HWConformity::moveSrcToGRF(INST_LIST_ITER it, uint32_t srcNum, uint16_t numGRF, G4_BB *bb)
1509 {
1510     G4_INST* inst = *it;
1511     G4_ExecSize execSize = inst->getExecSize();
1512 
1513     G4_Operand *src = inst->getSrc(srcNum);
1514     uint32_t srcTypeSize = src->getTypeSize();
1515     uint16_t dclSize = (numEltPerGRF<Type_UB>() * numGRF) / srcTypeSize;
1516     uint16_t hs = dclSize / execSize;
1517     uint16_t wd = execSize;
1518     uint16_t vs = hs * wd;
1519     const RegionDesc* region = builder.createRegionDesc(vs, wd, hs);
1520 
1521     // look up in MOV table to see if there is already inserted MOV for this source.
1522     G4_INST* def_inst = NULL;
1523     def_inst = checkSrcDefInst(inst, def_inst, srcNum);
1524 
1525     G4_Type tmpType = G4_Operand::GetNonVectorImmType(src->getType());
1526 
1527     if (def_inst && def_inst->getDst()->getType() == tmpType &&
1528         (def_inst->getExecSize() == execSize) &&
1529         def_inst->getDst()->coverGRF(numGRF, execSize) &&
1530         def_inst->getDst()->checkGRFAlign() &&
1531         (bb->isAllLaneActive() || def_inst->isWriteEnableInst()))
1532     {
1533 
1534         //inst->removeDefUse(Gen4_Operand_Number(srcNum + 1));
1535         //def_inst->useInstList.emplace_back(inst, Gen4_Operand_Number(srcNum + 1));
1536         //inst->defInstList.emplace_back(def_inst, Gen4_Operand_Number(srcNum + 1));
1537 
1538         G4_DstRegRegion* existing_def = def_inst->getDst();
1539         G4_SrcRegRegion* newSrc = builder.createSrc(
1540                         existing_def->getBase(),
1541                         existing_def->getRegOff(),
1542                         existing_def->getSubRegOff(),
1543                         region,
1544                         src->getType());
1545         inst->setSrc(newSrc, srcNum);
1546     }
1547 
1548     G4_Declare* dcl = builder.createTempVar(dclSize, src->getType(), GRFALIGN);
1549     G4_DstRegRegion *dstRegion = builder.createDst(
1550                         dcl->getRegVar(),
1551                         0,
1552                         0,
1553                         hs,
1554                         dcl->getElemType());
1555     G4_INST* newInst = builder.createMov(
1556         execSize, dstRegion, src, (!bb->isAllLaneActive() ? InstOpt_WriteEnable : InstOpt_NoOpt), false);
1557 
1558     // insert instruction and maintain def-use chain
1559     bb->insertBefore(it, newInst);
1560     inst->transferDef(newInst, Gen4_Operand_Number(srcNum + 1), Opnd_src0);
1561     newInst->addDefUse(inst, Gen4_Operand_Number(srcNum + 1));
1562 
1563     G4_SrcRegRegion* newSrc = builder.createSrc(
1564                         dcl->getRegVar(),
1565                         0,
1566                         0,
1567                         region,
1568                         dcl->getElemType());
1569     inst->setSrc(newSrc, srcNum);
1570 }
1571 
1572 /*
1573  *  create a new mov instruction and insert it before iter
1574  *  mov (esize) tmpDst dst (nomask)
1575  *  add (esize) tmpDst ...
1576  *  where esize is "inst"'s execution size
1577  *
1578  */
saveDst(INST_LIST_ITER & it,uint8_t stride,G4_BB * bb)1579 void HWConformity::saveDst(INST_LIST_ITER& it, uint8_t stride, G4_BB *bb)
1580 {
1581     G4_INST* inst = *it;
1582     G4_DstRegRegion *dst = inst->getDst();
1583     G4_ExecSize execSize = inst->getExecSize();
1584     G4_Type dstType = dst->getType();
1585     uint16_t dstWidthBytes = execSize * TypeSize(dstType) * stride;
1586 
1587     G4_SubReg_Align subAlign = getDclAlignment(dstWidthBytes, inst, execSize == 1);
1588 
1589     uint32_t numElt = execSize == 1 ? 1 : execSize * stride;
1590     G4_Declare* dcl = builder.createTempVar(numElt, dstType, subAlign);
1591 
1592     uint16_t hs = dst->getHorzStride();
1593     const RegionDesc *region = builder.createRegionDesc(hs * execSize, execSize, hs);
1594     G4_SrcRegRegion *srcRegion = builder.createSrc(dst->getBase(), dst->getRegOff(),
1595         dst->getSubRegOff(), region, dstType);
1596 
1597     G4_DstRegRegion *tmpDstOpnd = builder.createDstRegRegion(dcl, stride);
1598 
1599     unsigned int new_option = inst->getOption();
1600 
1601     G4_INST* newInst = builder.createMov(execSize, tmpDstOpnd, srcRegion, new_option, false);
1602     newInst->setNoMask(true);
1603 
1604     bb->insertBefore(it, newInst);
1605     inst->setDest(builder.duplicateOperand(tmpDstOpnd));
1606 }
1607 
restoreDst(INST_LIST_ITER & it,G4_DstRegRegion * origDst,G4_BB * bb)1608 void HWConformity::restoreDst(INST_LIST_ITER& it, G4_DstRegRegion *origDst, G4_BB *bb)
1609 {
1610     G4_INST* inst = *it;
1611     G4_DstRegRegion *dst = inst->getDst();
1612     G4_ExecSize execSize = inst->getExecSize();
1613 
1614     uint16_t hs = dst->getHorzStride();
1615     const RegionDesc *region = builder.createRegionDesc(hs * execSize, execSize, hs);
1616     G4_SrcRegRegion *srcRegion = builder.createSrc(dst->getBase(), dst->getRegOff(),
1617         dst->getSubRegOff(), region, dst->getType());
1618 
1619     unsigned int new_option = inst->getOption();
1620 
1621     G4_INST* newInst = builder.createMov(execSize, origDst, srcRegion, new_option, false);
1622     newInst->setNoMask(true);
1623 
1624     INST_LIST_ITER iter = it;
1625     iter++;
1626     bb->insertBefore(iter, newInst);
1627 
1628     // how about def-use?
1629     inst->transferUse(newInst);
1630     inst->addDefUse(newInst, Gen4_Operand_Number::Opnd_src0);
1631 }
1632 
1633 /*
1634  *  create a new mov instruction and insert it after iter
1635  *  mov (esize) dst tmp:dst_type
1636  *  where esize is "inst"'s execution size and insert it after "inst"
1637  *  dst of inst is replaced with the tmp dst using the same type
1638  */
insertMovAfter(INST_LIST_ITER & it,uint16_t stride,G4_BB * bb)1639 void HWConformity::insertMovAfter(INST_LIST_ITER& it, uint16_t stride, G4_BB* bb)
1640 {
1641     G4_INST* inst = *it;
1642     G4_DstRegRegion *dst = inst->getDst();
1643     G4_ExecSize execSize = inst->getExecSize();
1644     G4_Type execType = inst->getExecType(), dstType = dst->getType();
1645     uint16_t opExecWidthBytes = execSize * TypeSize(execType);
1646     uint16_t dstWidthBytes = execSize * TypeSize(dstType) * stride;
1647 
1648     G4_SubReg_Align subAlign = getDclAlignment(opExecWidthBytes > dstWidthBytes ? opExecWidthBytes : dstWidthBytes,
1649         inst, execSize == 1);
1650 
1651     G4_Declare* dcl = builder.createTempVar(execSize * stride, dstType, subAlign);
1652 
1653     const RegionDesc* region = builder.createRegionDesc(stride, 1, 0);
1654     G4_SrcRegRegion *srcRegion = builder.createSrcRegRegion(dcl, region);
1655     G4_DstRegRegion *tmpDstOpnd = builder.createDstRegRegion(dcl, stride);
1656 
1657     G4_Predicate *pred = NULL;
1658     if (inst->opcode() != G4_sel) {
1659         pred = inst->getPredicate();
1660         inst->setPredicate(NULL);
1661     }
1662     unsigned int new_option = inst->getOption();
1663 
1664     G4_INST* newInst = builder.createMov(execSize, dst, srcRegion, new_option, false);
1665     newInst->setPredicate(pred);
1666 
1667     INST_LIST_ITER iter = it;
1668     iter++;
1669     bb->insertBefore(iter, newInst);
1670     // change dst of inst
1671     inst->setDest(tmpDstOpnd);
1672 
1673     // update propagation info
1674     if (pred)
1675     {
1676         inst->transferDef(newInst, Opnd_pred, Opnd_pred);
1677     }
1678 
1679     inst->transferUse(newInst);
1680     inst->addDefUse(newInst, Opnd_src0);
1681 }
1682 
1683 
removeBadSrc(INST_LIST_ITER & iter,G4_BB * bb,bool crossGRFDst,bool oneGRFSrc[3],bool badTwoGRFSrc[3])1684 void  HWConformity::removeBadSrc(INST_LIST_ITER& iter, G4_BB *bb, bool crossGRFDst, bool oneGRFSrc[3], bool badTwoGRFSrc[3])
1685 {
1686     G4_INST *inst = *iter;
1687     G4_Operand *dst = inst->getDst();
1688     // check source and dst region together
1689     // get rid of bad two-GRF source
1690     for (int i = 0; i < inst->getNumSrc(); i++)
1691     {
1692 
1693         if (badTwoGRFSrc[i])
1694         {
1695             if (!crossGRFDst ||
1696                 (dst && IS_DTYPE(dst->getType()) && IS_WTYPE(inst->getSrc(i)->getType())))
1697             {
1698                 inst->setSrc(insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb), i);
1699             }
1700             else
1701             {
1702                 moveSrcToGRF(iter, i, 2, bb);
1703             }
1704             badTwoGRFSrc[i] = false;
1705             INST_LIST_ITER tmpIter = iter;
1706             tmpIter--;
1707             reduceExecSize(tmpIter, bb);
1708             if (builder.getOption(vISA_OptReport))
1709             {
1710                 (*tmpIter)->emit(std::cout);
1711                 std::cout << std::endl;
1712             }
1713         }
1714     }
1715 }
1716