1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "HWConformity.h"
10 #include "Optimizer.h"
11 #include "G4_Verifier.hpp"
12 #include "InstSplit.h"
13
14 using namespace vISA;
15
checkMinExecSize(G4_opcode op)16 uint8_t HWConformity::checkMinExecSize(G4_opcode op)
17 {
18 if (op == G4_dp2 ||
19 op == G4_dp3 ||
20 op == G4_dp4 ||
21 op == G4_dph)
22 {
23 return 4;
24 }
25 else if (op == G4_line || op == G4_pln) {
26 return 8;
27 }
28 else if (op == G4_sad2 || op == G4_sada2) {
29 return 2;
30 }
31 else
32 return 1;
33 }
34
fixOpndTypeAlign(G4_BB * bb)35 void HWConformity::fixOpndTypeAlign(G4_BB* bb)
36 {
37 INST_LIST_ITER i = bb->begin();
38 INST_LIST_ITER next_iter = i;
39 bool needSplit = false;
40
41 for (auto iEnd = bb->end(); i != iEnd; i = next_iter)
42 {
43 G4_INST *inst = *i;
44 G4_opcode opcode = inst->opcode();
45 if (opcode == G4_nop || opcode == G4_label || inst->mayExceedTwoGRF()) {
46 next_iter++;
47 } else if (fixInstOpndTypeAlign(i, bb)) {
48 needSplit = true;
49 next_iter = i;
50 next_iter++;
51 } else {
52 next_iter++;
53 }
54 #ifdef _DEBUG
55 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
56 #endif
57 }
58
59 if (needSplit)
60 {
61 // make sure updated insts and new moves don't cross 2 GRF
62 InstSplitPass instSplitter(&builder);
63 instSplitter.runOnBB(bb);
64 }
65 }
66
67 // Fix instructions with vector immediate as source operands.
68 // mov (8) r5.0<2>:uw 0xfdb97531:uv {Align1}
69 // becomes
70 // mov (8) r6.0<1>:uw 0xfdb97531:uv {Align1}
71 // mov (8) r5.0<2>:uw r6.0<8;8,1>:uw {Align1, Q1}
72 //
73 // When an immediate vector is used in an instruction, the destination must
74 // be 128-bit aligned with destination horizontal stride equivalent to a
75 // word for an immediate integer vector (v) and equivalent to a DWord for an
76 // immediate float vector (vf).
77 bool
fixDstAlignmentWithVectorImm(INST_LIST_ITER iter,G4_BB * bb)78 HWConformity::fixDstAlignmentWithVectorImm(INST_LIST_ITER iter, G4_BB *bb)
79 {
80 bool changed = false;
81 G4_INST *inst = *iter;
82 G4_DstRegRegion *reg = inst->getDst();
83 uint8_t execSize = inst->getExecSize();
84
85 bool dstAligned = builder.isOpndAligned(reg, 16);
86
87 unsigned hsInBytes = reg->getHorzStride() * reg->getTypeSize();
88 for (int k = 0, e = inst->getNumSrc(); k < e; ++k)
89 {
90 G4_Operand *src = inst->getSrc(k);
91 if (!src || !src->isImm())
92 continue;
93
94 G4_Type ty = src->getType();
95 G4_Type moveTy = (ty == Type_V) ? Type_W :
96 (ty == Type_UV) ? Type_UW :
97 (ty == Type_VF) ? Type_F : Type_UNDEF;
98 if (moveTy == Type_UNDEF)
99 continue;
100
101 if (!dstAligned)
102 {
103 inst->setSrc(insertMovBefore(iter, k, moveTy, bb), k);
104 changed = true;
105 }
106 else if (hsInBytes != TypeSize(moveTy))
107 {
108 if (hsInBytes == 4 && execSize < 8)
109 {
110 // for the case where dst is dword and execution size is < 8,
111 // we can interleave the vector to avoid a move
112 // e.g., mov (2) r1.0<1>:d 0x21:uv -->
113 // mov (2) r1.0<1>:d 0x0201:uv
114 uint32_t bitValue = 0;
115 uint16_t immBits = (uint16_t) src->asImm()->getImm();
116 for (int i = 0; i < execSize; ++i)
117 {
118 int val = (immBits >> (i*4)) & 0xF;
119 bitValue |= val << (i * 8);
120 }
121 inst->setSrc(builder.createImm(bitValue, ty), k);
122 }
123 else
124 {
125 inst->setSrc(insertMovBefore(iter, k, moveTy, bb), k);
126 changed = true;
127 }
128 }
129 }
130
131 return changed;
132 }
133
134 // Do basic HW conformity check related to operand type and dst alignment before resucing execution size
135 // to avoid spliting of the MOV inserted in this stage.
136 // This function is called for some instructions generated in later stages.
fixInstOpndTypeAlign(INST_LIST_ITER i,G4_BB * bb)137 bool HWConformity::fixInstOpndTypeAlign(INST_LIST_ITER i, G4_BB* bb)
138 {
139 G4_INST *inst = *i;
140 bool insertedInst = false;
141
142 if (inst->opcode() == G4_srnd)
143 {
144 // Operands can be packed.
145 return false;
146 }
147
148 int extypesize = 0;
149 G4_Type extype = inst->getOpExecType(extypesize);
150
151 if (extypesize == numEltPerGRF<Type_UB>()/2 && inst->opcode() != G4_mov)
152 {
153 fixPackedSource(i, bb);
154 extype = inst->getOpExecType(extypesize);
155 }
156
157 // fixes opernds including
158 // swapping sel,
159 fixOpnds(i, bb, extype);
160
161 extype = inst->getOpExecType(extypesize);
162 if (inst->getDst() && !(inst->isSend()) && !(inst->isRawMov()))
163 {
164 if (extypesize < (int)numEltPerGRF<Type_UB>()/2)
165 {
166 uint32_t dst_elsize = inst->getDst()->getTypeSize();
167 if (dst_elsize < (unsigned int)extypesize)
168 {
169 if (fixDstAlignment(i, bb, extype, dst_elsize))
170 {
171 insertedInst = true;
172 }
173 }
174 }
175
176 auto hasPackedImm = [](G4_INST *inst) {
177 for (unsigned i = 0, e = inst->getNumSrc(); i != e; ++i) {
178 auto src = inst->getSrc(i);
179 if (!src || !src->isImm())
180 continue;
181 switch (src->getType()) {
182 case Type_V:
183 case Type_UV:
184 case Type_VF:
185 return true;
186 default:
187 break;
188 }
189 }
190 return false;
191 };
192
193 // There are vector immediate source operands.
194 if (hasPackedImm(*i)) {
195 if ((insertedInst = fixDstAlignmentWithVectorImm(i, bb))) {
196 // Recompute the execution type size if there is some change.
197 // This allows fixDstAlignment to fix possible conformity issues.
198 extype = inst->getOpExecType(extypesize);
199 uint32_t dst_elsize = inst->getDst()->getTypeSize();
200 if (dst_elsize < unsigned(extypesize)) {
201 if (fixDstAlignment(i, bb, extype, dst_elsize)) {
202 insertedInst = true;
203 }
204 }
205 }
206 }
207 }
208
209 return insertedInst;
210 }
211
212 // check Rule 2H
213 // VertStride must be used to cross GRF register boundaries. This rule implies that elements within a 'Width' cannot cross GRF boundaries.
214 // This is a separate function from fixSrcRegion because we may need to split the instruction to satisfy this rule
checkSrcCrossGRF(INST_LIST_ITER & iter,G4_BB * bb)215 bool HWConformity::checkSrcCrossGRF(INST_LIST_ITER& iter, G4_BB* bb)
216 {
217 G4_INST* inst = *iter;
218 for (int i = 0; i < G4_MAX_SRCS; i++)
219 {
220 if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion())
221 {
222 G4_SrcRegRegion *src = inst->getSrc(i)->asSrcRegRegion();
223 bool widthCrossingGRF = false;
224 const RegionDesc* srcRegion = src->getRegion();
225 uint16_t vs = srcRegion->vertStride, wd = srcRegion->width, hs = srcRegion->horzStride;
226 uint8_t exSize = inst->getExecSize();
227 if (src->getRegAccess() == Direct && src->crossGRF())
228 {
229 int elementSize = src->getTypeSize();
230 int startOffset = src->getLeftBound() % numEltPerGRF<Type_UB>();
231 for (int row = 0; row < exSize / wd; row++)
232 {
233 int rowOffset = (startOffset + row * vs * elementSize) % numEltPerGRF<Type_UB>();
234 if (rowOffset + (wd - 1) * hs * elementSize >= (int)numEltPerGRF<Type_UB>())
235 {
236 widthCrossingGRF = true;
237 break;
238 }
239 }
240 }
241 else if (src->getRegAccess() == IndirGRF)
242 {
243 widthCrossingGRF = wd > 1 && hs != 0;
244 }
245
246 auto doSplit = [&](bool canCrossGRF) -> void {
247 if (inst->usesFlag() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst()))
248 {
249 // splitting may be unsafe, insert a move then split the move
250 G4_Operand* newSrc = insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb);
251 inst->setSrc(newSrc, i);
252 auto movIter = iter;
253 --movIter;
254 splitInstruction(movIter, bb, false, 0, false, canCrossGRF);
255 }
256 else
257 {
258 splitInstruction(iter, bb, false, 0, false, canCrossGRF);
259 }
260 };
261
262 if (widthCrossingGRF)
263 {
264 uint16_t stride = 0;
265 if (srcRegion->isSingleStride(exSize, stride))
266 {
267 // replace <v;w,h> with <h;1,0>
268 src->setRegion(builder.createRegionDesc(stride, 1, 0), true);
269 }
270 else
271 {
272 doSplit(true);
273 return true;
274 }
275 }
276 else if (kernel.getKernelType() == VISA_CM && builder.no64bitRegioning() &&
277 src->getTypeSize() == 8)
278 {
279 // for CM, split non-scalar, non-contiguous source that cross GRF as HW conformity
280 // may be not equipped to deal with them later
281 const RegionDesc* region = src->getRegion();
282 if (!region->isScalar() && !region->isContiguous(inst->getExecSize()) &&
283 src->crossGRF())
284 {
285 doSplit(false);
286 return true;
287 }
288 }
289 }
290 }
291
292 return false;
293 }
294
fixInstExecSize(G4_BB * bb)295 void HWConformity::fixInstExecSize(G4_BB* bb)
296 {
297 #ifdef _DEBUG
298 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
299 #endif
300
301 INST_LIST_ITER i = bb->begin();
302 INST_LIST_ITER next_iter = i;
303
304 for (; i != bb->end(); i = next_iter)
305 {
306 next_iter++;
307 G4_INST *inst = *i;
308 G4_opcode opcode = inst->opcode();
309 if (opcode == G4_nop || opcode == G4_label || inst->mayExceedTwoGRF())
310 {
311 continue;
312 }
313
314 if (reduceExecSize(i, bb))
315 {
316 next_iter = i;
317 next_iter++;
318 }
319 }
320 }
// Splits or patches a vISA (CISA) instruction so that it follows the Gen
// register region restrictions.
//
// Overview of the visible control flow:
//   * null inst, send, exec size 1 and madw are rejected up front (return false);
//   * SIMD16 integer-divide math is delegated to reduceExecSizeForMath();
//   * on platforms >= GENX_SKL only checkSrcCrossGRF() is applied;
//   * otherwise the BDW/CHV rules (non-CM kernels) or the pre-BDW rules decide
//     whether a split is needed (local flag `splitOp`), and the code after the
//     checks either inserts temporaries/MOVs or splits the instruction.
//
// iter - iterator to the instruction to examine.
// bb   - basic block that owns the instruction.
//
// Return value: true on the paths that redirect dst through insertMovAfter()
// or that end in splitInstruction(); the paths that end in evenlySplitInst()
// or splitSIMD32Inst() return the local `insertMOV` flag, which is still false
// at those points. fixInstExecSize() uses the result to decide whether to
// recompute the iterator it continues from.
bool HWConformity::reduceExecSize(INST_LIST_ITER iter, G4_BB* bb)
{
    G4_INST *inst = *iter;
    // Madw can't be split in any pass except for fixMadwInst as it will cause the dst(SOA layout) unexpected. For example:
    // madw (M1, 16) dst(0,0)<1> src0(0,0)<1;1,0> 0x38:ud 0x0:ud
    // If split here, then low result is in dst(0,0) and dst(2,0), and high result is in dst(1,0) and dst(3,0)
    // madw (M1, 8) dst(0,0)<1> src0(0,0)<1;1,0> 0x38:ud 0x0:ud
    // madw (M8, 8) dst(2,0)<1> src0(1,0)<1;1,0> 0x38:ud 0x0:ud
    // But expected dst is low result is in dst(0,0) and dst(1,0) and high result is in dst(2,0) and dst(3,0)
    if (!inst || inst->isSend() || inst->getExecSize() == 1 || inst->opcode() == G4_madw)
    {
        return false;
    }

    // Set to true only on the temp-dst path near the end of the function.
    bool insertMOV = false;

    G4_DstRegRegion *dst = inst->getDst();
    uint8_t minExSize = checkMinExecSize(inst->opcode());

    bool useAcc = (inst->hasACCSrc() ||
        (dst && dst->isAccReg()) ||
        inst->getImplAccDst());

    // TODO pre-processing of replicate region, VxH or all indirect sources?
    bool nullDst = inst->hasNULLDst();
    bool packedByteDst = false;
    if (!nullDst && dst)
    {
        packedByteDst = IS_BTYPE(dst->getType()) && (dst->getHorzStride() == 1);
    }

    unsigned char execSize = inst->getExecSize();
    bool splitOp = false, goodOneGRFDst = false;
    bool crossGRFDst = dst && dst->isCrossGRFDst();
    bool goodTwoGRFDst = false;
    // for all platforms, if execution size is 8 or less and the destination register is 2, flag updates are not supported.
    bool specialCondForComprInst = (execSize < 8 && dst && dst->getHorzStride() != 1 &&
        inst->getCondMod() && inst->opcode() != G4_sel);

    TARGET_PLATFORM genX = builder.getPlatform();

    // rules specific to math instructions
    // INT DIV function does not support SIMD16
    if (inst->isMath() && inst->asMathInst()->isMathIntDiv() && execSize == 16)
    {
        return reduceExecSizeForMath(iter, bb);
    }

    if (genX >= GENX_SKL)
    {
        // SKL removes rules for GRF alignments, so we don't have to check whether the dst or src is evenly split anymore
        // This means that any subreg of source can move to any subreg of dst
        // FIXME: From the comments it seems we still have to handle "specialCondForComprInst"
        return checkSrcCrossGRF(iter, bb);
    }

    // various variables needed for instruction splitting, for some reason
    G4_opcode op = inst->opcode();
    G4_Type instExecType = inst->getExecType();
    // NOTE(review): oneGRFSrc is never assigned in this function; it stays
    // all-false until it is handed to removeBadSrc().
    bool oneGRFSrc[3] = { false, false, false };
    bool twoGRFSrc[3] = { false, false, false };
    bool badTwoGRFSrc[3] = { false, false, false };
    bool evenTwoGRFSrc[3] = { false, false, false };
    bool fullTwoGRFSrc[3] = { false, false, false };
    bool hasBadTwoGRFSrc = false;
    bool compOpt = false,
        forceEvenSplit = (execSize == 32 && inst->opcode() == G4_sel && inst->getCondMod()) || packedByteDst;
    uint8_t numInFirstMov = 0;
    // The instruction depends on a flag or on the execution mask.
    bool useFlag = inst->getPredicate() || inst->getCondMod() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
    bool evenSplitDst = false;

    // separate the checks for BDW to make it more maintainable
    // For CM use pre-BDW region rules due to HW bugs.
    if (kernel.getKernelType() != VISA_CM && (genX == GENX_BDW || genX == GENX_CHV))
    {
        // for BDW we check the following rules:
        // Rule 3D
        // [DevBDW/DevCHV]: When an instruction has a source region spanning two registers and a destination regioning contained in one register, one of the following must be true:
        // The destination region is entirely contained in the lower Oword of a register.
        // The destination region is entirely contained in the upper Oword of a register.
        // The destination elements are evenly split between the two OWords of a register AND evenly split between the two source registers.
        // Rule 3G
        // [DevBDW]: When destination spans two registers, the source may be one or two registers. The destination elements must be evenly split between the two registers.

        bool mayUseIntAcc = op == G4_pseudo_sada2;
        if (crossGRFDst)
        {
            // rule 3G
            goodTwoGRFDst = inst->goodTwoGRFDst(evenSplitDst) && !specialCondForComprInst && !mayUseIntAcc;
            splitOp = !goodTwoGRFDst;
        }
        else
        {
            // rule 3D
            G4_Operand *srcs[3];
            uint8_t eleInFirstGRF[3];

            for (int i = 0; i < inst->getNumSrc(); i++)
            {
                srcs[i] = inst->getSrc(i);
                if (srcs[i] && srcs[i]->isSrcRegRegion())
                {
                    bool indirectSrc = srcs[i]->asSrcRegRegion()->getRegAccess() != Direct;
                    if (!indirectSrc && srcs[i]->asSrcRegRegion()->isScalar())
                    {
                        continue;
                    }

                    if (inst->opcode() == G4_pln && i == 1)
                    {
                        // src1 for plane may touch multiple GRFs as there's a hidden source
                        continue;
                    }

                    if (srcs[i]->crossGRF())
                    {
                        twoGRFSrc[i] = true;

                        if (!nullDst && dst)
                        {
                            // check if dst can be entirely contained in one oword
                            int dstRegionSize = dst->getRightBound() - dst->getLeftBound() + 1;

                            if (dstRegionSize <= 16)
                            {
                                // see if we can make the dst fit in one oword
                                unsigned short dstOffset = 0;
                                bool dstOwordAligned = false;
                                int dstAlign = Round_Up_Pow2(dstRegionSize);
                                dstOwordAligned = builder.isOpndAligned(dst, dstOffset, dstAlign);
                                if (!dstOwordAligned)
                                {
                                    // If we can align dst to its size, it must fit in one OWord
                                    // if we can't, it may still be in OWord (e.g., for size < 16)
                                    dstOffset %= numEltPerGRF<Type_UB>();
                                    bool fitInOword = !(dstOffset < 16 && (dstOffset + dstRegionSize) > 16);
                                    if (!fitInOword)
                                    {
                                        // technically if dst and src are both evenly split the instruction is
                                        // still ok, but this case should be rare so we ignore it
                                        G4_DstRegRegion* newDst = insertMovAfter(iter, dst, dst->getType(), bb);
                                        bool alignTmpDst = builder.isOpndAligned(newDst, dstOffset, 16);
                                        MUST_BE_TRUE(alignTmpDst, "must be able to oword align tmp dst");
                                        inst->setDest(newDst);
                                        return true;
                                    }
                                }
                            }
                            else
                            {
                                // dst does not fit in one oword, but is guaranteed to be evenly split (it does not cross GRF).
                                // check if src is evenly split across the two GRFs
                                bool sameSubregOff, vertCrossGRF, contRegion;
                                evenTwoGRFSrc[i] = srcs[i]->asSrcRegRegion()->evenlySplitCrossGRF(
                                    execSize, sameSubregOff, vertCrossGRF, contRegion, eleInFirstGRF[i]);
                                bool coverTwoGRF = srcs[i]->asSrcRegRegion()->coverTwoGRF();
                                const RegionDesc *rd = srcs[i]->asSrcRegRegion()->getRegion();
                                uint16_t stride = 0;
                                fullTwoGRFSrc[i] = coverTwoGRF && rd->isSingleStride(inst->getExecSize(), stride) && (stride == 1);

                                if (!evenTwoGRFSrc[i])
                                {
                                    // evenly split would be the best approach here, but unfortunately we can't do it
                                    // if the instruction is predicated
                                    splitOp = true;
                                    // compensation OPT
                                    // NOTE(review): goodOneGRFDst is not assigned anywhere in this BDW/CHV
                                    // branch, so it is still false here and this condition cannot hold;
                                    // every such source ends up marked as a bad two-GRF source - confirm intent.
                                    if (!forceEvenSplit && !hasBadTwoGRFSrc && minExSize == 1 && goodOneGRFDst &&
                                        contRegion && eleInFirstGRF[i] > (execSize >> 1))
                                    {
                                        if (!useFlag && ((!compOpt && numInFirstMov == 0) || numInFirstMov == eleInFirstGRF[i]))
                                        {
                                            compOpt = true;
                                            numInFirstMov = eleInFirstGRF[i];
                                        }
                                        else
                                        {
                                            compOpt = false;
                                            hasBadTwoGRFSrc = true;
                                            badTwoGRFSrc[i] = true;
                                        }
                                    }
                                    else
                                    {
                                        hasBadTwoGRFSrc = true;
                                        badTwoGRFSrc[i] = true;
                                    }
                                }
                            }
                        }
                        // nothing needs to be done when dst is null???
                    }
                }
            }
        }

        // the only reason for split is due to 32-bit flag
        // split inst into two SIMD 16 instructions
        if (!splitOp && execSize == 32 &&
            (inst->getPredicate() || inst->getCondMod()))
        {
            if (forceEvenSplit)
            {
                splitSIMD32Inst(iter, bb);
                // insertMOV is still false here
                return insertMOV;
            }
        }
    }
    else
    {
        // pre-BDW checks

        // Check if the instruction will use int ACC later. if yes, compressed instruction
        // is split into 2 one-GRF instructions.

        bool mayUseIntAcc = op == G4_pseudo_sada2 ||
            (op == G4_mul && IS_DTYPE(inst->getSrc(0)->getType()) && IS_DTYPE(inst->getSrc(1)->getType()));

        if (crossGRFDst)
        {
            // rule 3D
            goodTwoGRFDst = inst->goodTwoGRFDst(evenSplitDst) && !specialCondForComprInst && !mayUseIntAcc;
            splitOp = !goodTwoGRFDst;
        }

        G4_Operand *srcs[3];
        uint8_t eleInFirstGRF[3];
        for (int i = 0; i < inst->getNumSrc(); i++)
        {
            srcs[i] = inst->getSrc(i);

            if (srcs[i] && srcs[i]->isSrcRegRegion() &&
                !(inst->opcode() == G4_math && i == 1 && srcs[i]->isNullReg()))
            {
                bool indirectSrc = (srcs[i]->isSrcRegRegion() &&
                    srcs[i]->asSrcRegRegion()->getRegAccess() != Direct);

                if (!indirectSrc && srcs[i]->asSrcRegRegion()->isScalar())
                {
                    continue;
                }
                if (inst->opcode() == G4_pln && i == 1)
                {
                    continue;
                }

                if (crossGRFDst && indirectSrc)
                {
                    // Assumption: all indirect operand follow GenX requirement (no cross-GRF indexing ...)
                    // pre_BDW rule 6D: When a Vx1 or a VxH addressing mode is used on src0,
                    // the destination must use ONLY one register.
                    // Vx1 is not handled now. only vxh is considered here
                    // if (srcs[i]->asSrcRegRegion()->getRegion()->isRegionWH())
                    {
                        splitOp = true;
                    }
                }
                else if (srcs[i]->crossGRF())
                {
                    twoGRFSrc[i] = true;
                    bool sameSubregOff, vertCrossGRF, contRegion;
                    evenTwoGRFSrc[i] = srcs[i]->asSrcRegRegion()->evenlySplitCrossGRF(
                        execSize, sameSubregOff, vertCrossGRF, contRegion, eleInFirstGRF[i]);
                    bool coverTwoGRF = srcs[i]->asSrcRegRegion()->coverTwoGRF();
                    const RegionDesc *rd = srcs[i]->asSrcRegRegion()->getRegion();
                    uint16_t stride = 0;
                    fullTwoGRFSrc[i] = coverTwoGRF && rd->isSingleStride(inst->getExecSize(), stride) && (stride == 1);

                    if (dst && !crossGRFDst)
                    {
                        // destination requirements are:
                        // -- The destination region is entirely contained in the lower OWord of a register.
                        // -- The destination region is entirely contained in the upper OWord of a register.
                        // -- The destination elements are evenly split between the two OWords of a register.
                        int dstRegionSize = dst->getRightBound() - dst->getLeftBound() + 1;

                        // round up dst region size to next power of two
                        int dstAlign = Round_Up_Pow2(dstRegionSize);

                        bool dstOwordAligned = false;
                        dstOwordAligned = builder.isOpndAligned(dst, dstAlign);
                        if (dstOwordAligned)
                        {
                            // If we can align dst to its size, it must fit in one OWord
                            goodOneGRFDst = true;
                        }
                        else
                        {
                            // if we can't, it may still be in one OWord or evenly split
                            goodOneGRFDst = dst->goodOneGRFDst(execSize);
                        }
                    }

                    // region can be fixed later in fixCompressedInst().
                    // rule 3E and 3F
                    // 2-GRF src should follow below implicit rules, no matter the dst size:
                    // pre-BDW
                    // 1. Data must be evenly split between source registers.
                    // 2. Same subregister number in the two GRFs(occupy whole two GRFs) if dst is two-GRF.
                    if (!evenTwoGRFSrc[i] ||
                        (((goodTwoGRFDst || (goodOneGRFDst && !contRegion)) && !sameSubregOff) ||
                        (goodTwoGRFDst && IS_WTYPE(srcs[i]->getType()) && !(srcs[i]->asSrcRegRegion()->checkGRFAlign() && coverTwoGRF))))
                    {
                        splitOp = true;
                        // compensation OPT
                        if (!forceEvenSplit && !hasBadTwoGRFSrc && minExSize == 1 && goodOneGRFDst &&
                            contRegion && eleInFirstGRF[i] > (execSize >> 1))
                        {
                            if (!useFlag && ((!compOpt && numInFirstMov == 0) || numInFirstMov == eleInFirstGRF[i]))
                            {
                                compOpt = true;
                                numInFirstMov = eleInFirstGRF[i];
                            }
                            else
                            {
                                compOpt = false;
                                hasBadTwoGRFSrc = true;
                                badTwoGRFSrc[i] = true;
                            }
                        }
                        else
                        {
                            hasBadTwoGRFSrc = true;
                            badTwoGRFSrc[i] = true;
                        }
                    }
                    // rule 3C and 3D
                    // mul (4) r8.3<1>:f r2.3<4;4,1>:f r31.0<8;2,4>:f {Align1}
                    else if (dst && !crossGRFDst && !goodOneGRFDst)
                    {
                        splitOp = true;
                    }
                }
            }
        }

        // the only reason for split is due to 32-bit flag
        // split inst into two SIMD 16 instructions
        if (!splitOp && execSize == 32 &&
            (packedByteDst || (inst->getPredicate() || inst->getCondMod())))
        {
            if (forceEvenSplit)
            {
                // FIXME: try to use evenlySplitInst() instead.
                splitSIMD32Inst(iter, bb);
                // insertMOV is still false here
                return insertMOV;
            }
        }

        // You will need to do this ONLY when destination spans 2 registers, src1 is a word or byte and you expect channels to be turned off !!
        // currently for instruction with pred or emask on pre-BDW
        // NOTE(review): oneGRFSrc[1] is never set to true in this function, so
        // this condition is always false as written (the short-circuit also
        // keeps srcs[1] from being read) - confirm whether that is intended.
        bool specialCondForShootDown = (dst && goodTwoGRFDst &&
            (inst->getPredicate() || (!bb->isAllLaneActive() && !inst->isWriteEnableInst())) &&
            oneGRFSrc[1] && (IS_BTYPE(srcs[1]->getType()) || IS_WTYPE(srcs[1]->getType())));
        if (specialCondForShootDown)
        {
            splitOp = true;
        }
    }

    if (!splitOp)
    {
        // nothing to fix; insertMOV is still false here
        return insertMOV;
    }

    MUST_BE_TRUE((inst->opcode() != G4_smov), "Error in splitting smov instruction");

    // split instruction like:
    // mad (8) V24(2,0)<2> V20(2,0)<16;8,2> V20(2,0)<16;8,2> V23(2,0)<16;8,2>
    if (splitOp && crossGRFDst && evenSplitDst && !hasBadTwoGRFSrc && (execSize >= 16 || !useFlag))
    {
        if (minExSize == 1 || execSize > minExSize)
        {
            evenlySplitInst(iter, bb);
            // insertMOV is still false here
            return insertMOV;
        }
    }

    // For inst with pred, condMod, or with mask in SIMDCF BB, we insert MOVs with nomask for src/dst
    // to avoid instruction splitting. inserted MOVs may be split into multiple instructions.
    // ATTN: We do not include sel here because the condMod generated by sel is never used.
    if (useFlag &&
        !(inst->opcode() == G4_sel && !(inst->getPredicate()) && inst->getCondMod()))
    {
        // if there is predicate or cond modifier, we keep the original instruction and
        // perform splitting on new MOV instructions.
        if (!nullDst && !crossGRFDst && !goodOneGRFDst)
        {
            // try to move 2-GRF src into 1GRF tmp to avoid splitting.
            // this is unnecessary in non-SIMDCF/nonPred/nonCondMod cases because we can do compensation.
            for (int i = 0; i < inst->getNumSrc(); i++)
            {
                if (twoGRFSrc[i] && !fullTwoGRFSrc[i])
                {
                    moveSrcToGRF(iter, i, 1, bb);
                    twoGRFSrc[i] = false;
                    badTwoGRFSrc[i] = false;
                    // step back to the instruction right before 'iter'
                    // (presumably the MOV created by moveSrcToGRF) and fix it recursively
                    INST_LIST_ITER tmpIter = iter;
                    tmpIter--;
                    if (builder.getOption(vISA_OptReport))
                    {
                        (*tmpIter)->emit(std::cout);
                        std::cout << std::endl;
                    }
                    reduceExecSize(tmpIter, bb);
                }
            }
            if (!fullTwoGRFSrc[0] && !fullTwoGRFSrc[1] && !fullTwoGRFSrc[2])
            {
                return insertMOV;
            }
        }
        //FIXME: another option is that if original exec size is 16 and will be split into
        // two simd8, we can use quarter control in some cases.

        if (!nullDst &&
            ((!crossGRFDst && !goodOneGRFDst) ||
            (crossGRFDst && !goodTwoGRFDst) ||
                (goodTwoGRFDst && specialCondForComprInst)))
        {
            // TODO: NULL dst
            // use temp dst.
            // case 1: SIMD CF
            // mov (16) r3.5<1>:b r1.0<8;8,1>:d
            // ==>
            // mov (16) r6.0<2>:b r3.5<16;16,1>:b {nomask}
            // mov (16) r6.0<2>:b r1.0<8;8,1>:d -- other dst alignment fix will take care of dst.
            // mov (16) r3.5<1>:b r6.0<32;16,2>:b {nomask}

            // case 2, no SIMDCF
            // (f0.0) mov (16) r3.5<1>:b r1.0<8;8,1>:d
            // ==>
            // mov (16) r6.0<2>:b r1.0<8;8,1>:d -- other dst alignment fix will take care of dst.
            // (f0.0) mov (16) r3.5<1>:b r6.0<32;16,2>:b

            // ratio between the execution type size and the dst element size
            uint8_t scale = TypeSize(instExecType) / dst->getTypeSize();

            if (scale > 1 &&
                TypeSize(instExecType) * (unsigned)execSize > numEltPerGRF<Type_UB>())
            {
                scale = numEltPerGRF<Type_UB>() / dst->getTypeSize() / execSize;
            }
            else if (scale == 0)
            {
                // dst element is wider than the execution type
                scale = 1;
            }

            // can't split if inst is in SIMD flow and is not NoMask, or the inst has predicate
            // Have to introduce a temp that supports splitting instead
            if ((!bb->isAllLaneActive() && !inst->isWriteEnableInst()) || inst->getPredicate())
            {
                saveDst(iter, scale, bb);
                INST_LIST_ITER tmpIter = iter;
                tmpIter--;
                if (builder.getOption(vISA_OptReport))
                {
                    (*tmpIter)->emit(std::cout);
                    std::cout << std::endl;
                    inst->emit(std::cout);
                    std::cout << std::endl;
                }
                // Fix up the move to load dst. We can split the move instruction as it is NoMask
                reduceExecSize(tmpIter, bb);

                // source may also be bad, so we have to call reduceExecSize() on iter again
                reduceExecSize(iter, bb);

                // generate MOV after inst
                // if the dst is bad, it will be fixed by the next call to reduceExecSize()
                restoreDst(iter, dst, bb);

                if (builder.getOption(vISA_OptReport))
                {
                    tmpIter = iter;
                    tmpIter++;
                    (*tmpIter)->emit(std::cout);
                    std::cout << std::endl;
                }
            }
            else
            {
                insertMovAfter(iter, scale, bb);
                if (builder.getOption(vISA_OptReport))
                {
                    INST_LIST_ITER tmpIter = iter;
                    tmpIter++;
                    (*tmpIter)->emit(std::cout);
                    std::cout << std::endl;
                }
            }
            // dst now goes through a temporary, so treat it as conforming from here on
            goodOneGRFDst = true;
            goodTwoGRFDst = true;
            crossGRFDst = false;
            insertMOV = true;
        }

        removeBadSrc(iter, bb, crossGRFDst, oneGRFSrc, badTwoGRFSrc);
        return insertMOV;
    }

    if (!nullDst && !crossGRFDst && !goodOneGRFDst && !hasBadTwoGRFSrc)
    {
        // insert a temp dst and a MOV
        // example:
        // add (8) r5.3<1>:w r2.0<16;8,2>:w 0x1:w
        // ==>
        // add (8) r6.0<1>:w r2.0<8;8,1>:d 0x1:w
        // mov (8) r5.3<1>:b r6.0<8;8,1>:w
        // In some cases splitting the instruction generates the same number of instruction
        // without dependency, but needs more analysis.
        inst->setDest(insertMovAfter(iter, dst, dst->getType(), bb));
        if (builder.getOption(vISA_OptReport))
        {
            inst->emit(std::cout);
            std::cout << std::endl;
            INST_LIST_ITER tmpIter = iter;
            tmpIter++;
            (*tmpIter)->emit(std::cout);
            std::cout << std::endl;
        }
        return true;
    }

    // only two kinds of instruction use ACC operands:
    // 1. instructions generated in ARCTAN intrinsic translation.
    // they do not need splitting
    // 2. instructions generated in MAC opt. there is a check to make
    // sure only evenly splitting will happen to them.
    if (useAcc)
    {
        evenlySplitInst(iter, bb);
        // insertMOV is still false here
        return insertMOV;
    }
    // split the instruction into a list of instructions
    splitInstruction(iter, bb, compOpt, numInFirstMov, false, true);
    return true;
}
859
860 // split a SIMD32 inst into two SIMD16.
861 // there is predicate/conditional modifier used in this inst.
862 //
863 // Result:
864 // Inst refered to by 'iter' is split into two simd16 instrcutions.
865 // One is inserted right before 'iter', the other is to reuse 'iter'.
866 // And the caller of this function can access two new instructions via '--iter' and 'iter' !
splitSIMD32Inst(INST_LIST_ITER iter,G4_BB * bb)867 void HWConformity::splitSIMD32Inst(INST_LIST_ITER iter, G4_BB* bb)
868 {
869 G4_INST *inst = *iter;
870 G4_opcode op = inst->opcode();
871 G4_Operand *srcs[3] = { nullptr };
872 int numSrc = inst->getNumSrc();
873
874 // check dst/src dependency
875 checkSrcDstOverlap(iter, bb, false);
876 for (int i = 0; i < numSrc; i++)
877 {
878 srcs[i] = inst->getSrc(i);
879 }
880
881 // compute max exeuction size.
882 // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
883 // if half-GRF boundary is crossed.
884 G4_DstRegRegion *dst = inst->getDst();
885 bool nullDst = dst && inst->hasNULLDst();
886 G4_ExecSize instExSize = inst->getExecSize(), currExSize = G4_ExecSize(instExSize / 2);
887 for (int i = 0; i < instExSize; i += currExSize)
888 {
889 // create new Oprands. Acc should not be split since we generate it in jitter and
890 // can control this.
891 // create new condMod and predicate
892 G4_CondMod *newCondMod = inst->getCondMod();
893 if (newCondMod)
894 {
895 newCondMod = builder.createCondMod(
896 newCondMod->getMod(), newCondMod->getBase(), i == 0 ? 0 : 1);
897 }
898
899 G4_Predicate *newPredOpnd = inst->getPredicate();
900 if (newPredOpnd)
901 {
902 newPredOpnd = builder.createPredicate(
903 newPredOpnd->getState(), newPredOpnd->getBase(), i == 0 ? 0 : 1, newPredOpnd->getControl());
904 }
905
906 G4_DstRegRegion *newDst;
907 if (!nullDst)
908 {
909 newDst = builder.createSubDstOperand(dst, (uint16_t) i, currExSize);
910 }
911 else
912 {
913 newDst = dst;
914 }
915 // generate new inst
916 G4_INST* newInst;
917 if ((i + currExSize) < instExSize)
918 {
919 newInst = builder.makeSplittingInst(inst, currExSize);
920 newInst->setDest(newDst);
921 newInst->setPredicate(newPredOpnd);
922 newInst->setCondMod(newCondMod);
923 bb->insertBefore(iter, newInst);
924 }
925 else
926 {
927 // reuse the original inst
928 newInst = inst;
929 newInst->setExecSize(currExSize);
930 newInst->setDest(newDst);
931 newInst->setPredicate(newPredOpnd);
932 newInst->setCondMod(newCondMod);
933 }
934
935 for (int j = 0; j < numSrc; j++)
936 {
937 if (srcs[j])
938 {
939 // src1 for single source math should be arc reg null.
940 if (srcs[j]->isImm() ||
941 (inst->opcode() == G4_math && j == 1 && srcs[j]->isNullReg()))
942 {
943 newInst->setSrc(srcs[j], j);
944 }
945 else if (srcs[j]->asSrcRegRegion()->isScalar() || (j == 0 && op == G4_line))
946 {
947 newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
948 }
949 else
950 {
951 newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), (uint16_t)i,
952 currExSize, (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->vertStride),
953 (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->width)), j);
954 }
955 }
956 }
957
958 // maintain def-use chain
959 if (newInst == inst)
960 {
961 newInst->trimDefInstList();
962 }
963 else
964 {
965 // Defs (uses) of this new instruction will be a subset of the
966 // original instruction's defs (uses).
967 inst->copyDefsTo(newInst, true);
968 inst->copyUsesTo(newInst, true);
969 }
970 }
971 }
972
splitInstruction(INST_LIST_ITER iter,G4_BB * bb,bool compOpt,uint8_t numInFirstMov,bool rule4_11,bool canSrcCrossGRF)973 void HWConformity::splitInstruction(INST_LIST_ITER iter, G4_BB* bb, bool compOpt, uint8_t numInFirstMov,
974 bool rule4_11, bool canSrcCrossGRF)
975 {
976 G4_INST *inst = *iter;
977 G4_opcode op = inst->opcode();
978 G4_Operand *srcs[3] = { nullptr };
979
980 // check dst/src dependency
981 checkSrcDstOverlap(iter, bb, compOpt);
982
983 int numSrcs = inst->getNumSrc();
984
985 for (int i = 0; i < numSrcs; i++)
986 {
987 srcs[i] = inst->getSrc(i);
988 }
989
990 uint8_t minExSize = checkMinExecSize(op);
991 // compute max exeuction size.
992 // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
993 // if half-GRF boundary is crossed.
994 G4_DstRegRegion *dst = inst->getDst();
995 bool nullDst = inst->hasNULLDst();
996 G4_ExecSize instExSize = inst->getExecSize();
997 G4_ExecSize currExSize;
998 uint16_t vs[3] = { 0 }, wd[3] = { 0 };
999
1000 G4_Predicate *instPred = inst->getPredicate();
1001
1002 // first, produce mask if needed
1003 // mov (16) r2.0<1>:uw 0:uw {Align1, NoMask} // 0:uw
1004 // mov (16) r2.0<1>:uw 0x1:uw {Align1} // 1:uw
1005 // this part is currently not used since we do not split inst with predicate or emask
1006 bool isSIMDCFInst = !bb->isAllLaneActive() && !inst->isWriteEnableInst();
1007 G4_Declare *maskDcl = NULL;
1008 if (instPred || isSIMDCFInst)
1009 {
1010 maskDcl = builder.createTempVar(instExSize, Type_UW, Eight_Word);
1011 G4_DstRegRegion * tmpMaskOpnd = builder.createDst(maskDcl->getRegVar(), 0, 0, 1, Type_UW);
1012
1013 G4_INST* firstMov = builder.createMov(instExSize,
1014 tmpMaskOpnd, builder.createImm(0, Type_UW), inst->getOption(), false);
1015
1016 G4_Predicate* pred = builder.duplicateOperand(inst->getPredicate());
1017 auto movInst = builder.createMov(instExSize, tmpMaskOpnd, builder.createImm(1, Type_UW), inst->getOption(), false);
1018 movInst->setPredicate(pred);
1019
1020 if (isSIMDCFInst)
1021 {
1022 firstMov->setNoMask(true);
1023 }
1024 }
1025
1026 bool needsMaskOffset = instPred || isSIMDCFInst || inst->getCondMod() != nullptr;
1027
1028 for (uint8_t i = 0; i < instExSize; i += currExSize)
1029 {
1030 if (compOpt && i == 0)
1031 {
1032 currExSize = G4_ExecSize(numInFirstMov);
1033 G4_INST *newInst = builder.makeSplittingInst(inst, instExSize);
1034 newInst->setDest(builder.duplicateOperand(inst->getDst()));
1035 newInst->setPredicate(builder.duplicateOperand(inst->getPredicate()));
1036 newInst->setCondMod(builder.duplicateOperand(inst->getCondMod()));
1037 for (int j = 0; j < inst->getNumSrc(); j++)
1038 {
1039 newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1040 }
1041 // update def-use chain
1042 inst->copyDefsTo(newInst, true);
1043 inst->copyDefsTo(newInst, true);
1044 bb->insertBefore(iter, newInst);
1045 continue;
1046 }
1047
1048 // this stores the max allowed exec size for each operand (0 -- dst, 1 -- src0, and so on)
1049 uint8_t opndExSize[4] = { 0, 0, 0, 0 };
1050 currExSize = G4_ExecSize(roundDownPow2(instExSize - i));
1051
1052 bool crossGRFsrc = false;
1053 for (int j = 0; j < numSrcs; j++)
1054 {
1055 if (!srcs[j] || !srcs[j]->isSrcRegRegion() ||
1056 srcs[j]->isNullReg() || (j == 0 && op == G4_line))
1057 {
1058 opndExSize[j + 1] = currExSize;
1059 continue;
1060 }
1061 bool twoGRFsrc = false;
1062 opndExSize[j+1] = srcs[j]->asSrcRegRegion()->getMaxExecSize(i, currExSize, canSrcCrossGRF, vs[j], wd[j], twoGRFsrc);
1063
1064 if (opndExSize[j + 1] > 8 && rule4_11)
1065 {
1066 opndExSize[j + 1] = 8;
1067 }
1068
1069 crossGRFsrc |= twoGRFsrc;
1070 if (minExSize == 1)
1071 {
1072 currExSize = G4_ExecSize(opndExSize[j + 1]);
1073 }
1074 }
1075
1076 if (dst && !nullDst)
1077 {
1078 opndExSize[0] = dst->getMaxExecSize(i, currExSize, crossGRFsrc);
1079
1080 if (opndExSize[0] > 8 && rule4_11)
1081 opndExSize[0] = 8;
1082 }
1083 else
1084 {
1085 // dst essentially does not affect the splitting decision
1086 opndExSize[0] = currExSize;
1087 }
1088
1089 if (minExSize == 1)
1090 {
1091 currExSize = G4_ExecSize(opndExSize[0]);
1092 }
1093
1094 bool needMov = false;
1095 if (minExSize > 1)
1096 {
1097 // find minimal execsize. if it is not less than minExSize, use it
1098 // to avoid dependency
1099 // FIXME: optimize this part by avoiding MOVs
1100 uint8_t currMinExSize = 64;
1101 currExSize = G4_ExecSize(0);
1102 for (int j = 0; j <= numSrcs; j++)
1103 {
1104 // use max possible exsize
1105 if (opndExSize[j] > currExSize)
1106 {
1107 currExSize = G4_ExecSize(opndExSize[j]);
1108 }
1109 if (opndExSize[j] != 0 && opndExSize[j] < currMinExSize)
1110 {
1111 currMinExSize = opndExSize[j];
1112 }
1113 }
1114
1115 if (currMinExSize >= minExSize)
1116 {
1117 currExSize = G4_ExecSize(currMinExSize);
1118 }
1119 else
1120 {
1121 for (int j = 0; j <= numSrcs; j++)
1122 {
1123 if (opndExSize[j] != 0 && opndExSize[j] < currExSize)
1124 {
1125 needMov = true;
1126 }
1127 }
1128 }
1129 }
1130
1131 MUST_BE_TRUE(currExSize != 0, "illegal execution size in instruction splitting");
1132 // create new Oprands. Acc should not be split since we generate it in jitter and
1133 // can control this.
1134 G4_DstRegRegion *newDst = !nullDst ? builder.createSubDstOperand(dst, (uint16_t)i, currExSize) : dst;
1135
1136 // generate new inst
1137 G4_INST* newInst;
1138 INST_LIST_ITER newInstIter;
1139 if ((i + currExSize) < instExSize)
1140 {
1141 newInst = builder.makeSplittingInst(inst, currExSize);
1142 newInst->setDest(newDst);
1143 newInst->setPredicate(builder.duplicateOperand(inst->getPredicate()));
1144 newInst->setCondMod(builder.duplicateOperand(inst->getCondMod()));
1145 bb->insertBefore(iter, newInst);
1146 newInstIter = iter;
1147 newInstIter--;
1148 }
1149 else
1150 {
1151 // reuse the original inst
1152 newInst = inst;
1153 newInst->setDest(newDst);
1154 newInst->setExecSize(currExSize);
1155 newInstIter = iter;
1156 }
1157
1158 for (int j = 0; j < inst->getNumSrc(); j++)
1159 {
1160 if (srcs[j])
1161 {
1162 // src1 for single source math should be arc reg null.
1163 if (srcs[j]->isImm() ||
1164 (inst->opcode() == G4_math && j == 1 && srcs[j]->isNullReg()))
1165 {
1166 newInst->setSrc(srcs[j], j);
1167 }
1168 else if (srcs[j]->asSrcRegRegion()->isScalar() || (j == 0 && op == G4_line))
1169 {
1170 newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1171 }
1172 else
1173 {
1174 if (srcs[j]->isAddrExp())
1175 {
1176 G4_AddrExp* addExp = builder.createAddrExp(srcs[j]->asAddrExp()->getRegVar(), srcs[j]->asAddrExp()->getOffset(), srcs[j]->asAddrExp()->getType());
1177 newInst->setSrc(addExp, j);
1178 }
1179 else
1180 {
1181 uint16_t start = i;
1182 newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), start, currExSize, vs[j], wd[j]), j);
1183 }
1184 }
1185 }
1186 }
1187
1188 if (instExSize == 16 &&
1189 currExSize == 8 &&
1190 needsMaskOffset)
1191 {
1192 if (instPred)
1193 {
1194 G4_Predicate* tPred = builder.duplicateOperand(instPred);
1195 tPred->setInst(newInst);
1196 newInst->setPredicate(tPred);
1197 }
1198
1199 if (newInst->getMaskOffset() == 0)
1200 {
1201 newInst->setMaskOption(i == 0 ? InstOpt_M0 : InstOpt_M8);
1202 }
1203 else
1204 {
1205 newInst->setMaskOption(i == 0 ? InstOpt_M16 : InstOpt_M24);
1206 }
1207 }
1208
1209 // maintain def-use chain
1210 if (newInst == inst)
1211 {
1212 newInst->trimDefInstList();
1213 }
1214 else
1215 {
1216 inst->copyDefsTo(newInst, /*checked*/true);
1217 inst->copyUsesTo(newInst, /*checked*/true);
1218 }
1219
1220 // the following code is to keep minimal execution size for some opcode, for example, DP4
1221 // insert mov if needed
1222 if (needMov)
1223 {
1224 for (int j = 0; j < inst->getNumSrc(); j++)
1225 {
1226 if (opndExSize[j + 1] < currExSize)
1227 {
1228 newInst->setSrc(insertMovBefore(newInstIter, j, srcs[j]->getType(), bb), j);
1229 // reducing exec size for new MOV
1230 INST_LIST_ITER newMovIter = newInstIter;
1231 newMovIter--;
1232 reduceExecSize(newMovIter, bb);
1233 if (builder.getOption(vISA_OptReport))
1234 {
1235 (*newMovIter)->emit(std::cout);
1236 std::cout << std::endl;
1237 }
1238 }
1239 }
1240 }
1241 if (builder.getOption(vISA_OptReport))
1242 {
1243 newInst->emit(std::cout);
1244 std::cout << std::endl;
1245 }
1246 // dst
1247 if (needMov && opndExSize[0] < currExSize)
1248 {
1249 (*newInstIter)->setDest(
1250 insertMovAfter(newInstIter, inst->getDst(), inst->getDst()->getType(), bb));
1251 INST_LIST_ITER newMovIter = newInstIter;
1252 newMovIter++;
1253 reduceExecSize(newMovIter, bb);
1254 if (builder.getOption(vISA_OptReport))
1255 {
1256 (*newMovIter)->emit(std::cout);
1257 std::cout << std::endl;
1258 }
1259 }
1260 }
1261 }
1262
1263
1264
1265 // evenly split an inst into two instructions with half execution size.
1266 // this is used to split a simd16 math into two simd8 before other reducing exeuction size actions
1267 //
1268 // This will has two instructions: one is right before "iter", the other is to re-use "iter". The
1269 // caller is safe to use "--iter" and "iter" to refer those two instructions.
evenlySplitInst(INST_LIST_ITER iter,G4_BB * bb,bool checkOverlap)1270 bool HWConformity::evenlySplitInst(INST_LIST_ITER iter, G4_BB* bb, bool checkOverlap)
1271 {
1272 G4_INST* inst = *iter;
1273 G4_opcode op = inst->opcode();
1274 G4_Operand* srcs[3];
1275 int origMaskOffset = inst->getMaskOffset();
1276 bool extraMov = false;
1277 const int numSrc = inst->getNumSrc();
1278
1279 // check dst/src dependency
1280 if (checkOverlap)
1281 {
1282 extraMov = checkSrcDstOverlap(iter, bb, false);
1283 }
1284
1285 bool useARF = false;
1286 for (int i = 0; i < numSrc; i++)
1287 {
1288 srcs[i] = inst->getSrc(i);
1289 }
1290
1291 // compute max exeuction size.
1292 // boundary is GRF-boundary and HS change, but for Dst, elements should be symetric
1293 // if half-GRF boundary is crossed.
1294
1295 G4_DstRegRegion* dst = inst->getDst();
1296 bool nullDst = dst && inst->hasNULLDst();
1297 G4_ExecSize instExSize = inst->getExecSize(), currExSize = G4_ExecSize(instExSize / 2);
1298
1299 G4_Predicate* newPred = NULL;
1300 if (inst->getPredicate())
1301 {
1302 newPred = inst->getPredicate();
1303 newPred->splitPred();
1304 }
1305
1306 G4_CondMod* newCond = NULL;
1307 if (inst->getCondMod())
1308 {
1309 newCond = inst->getCondMod();
1310 newCond->splitCondMod();
1311 }
1312
1313 G4_SrcRegRegion* accSrcRegion = NULL;
1314 if (inst->getImplAccSrc())
1315 {
1316 accSrcRegion = inst->getImplAccSrc()->asSrcRegRegion();
1317 }
1318
1319 G4_DstRegRegion* accDstRegion = NULL;
1320 if (inst->getImplAccDst())
1321 {
1322 accDstRegion = inst->getImplAccDst();
1323 }
1324
1325 if (accSrcRegion || accDstRegion || newPred || newCond)
1326 {
1327 useARF = true;
1328 }
1329
1330 for (int i = 0; i < instExSize; i += currExSize)
1331 {
1332 // create new Oprands.
1333 G4_DstRegRegion* newDst;
1334 if (!nullDst)
1335 {
1336 newDst = builder.createSubDstOperand(dst, (uint16_t)i, currExSize);
1337 }
1338 else
1339 {
1340 newDst = dst;
1341 }
1342 // generate new inst
1343 G4_INST* newInst;
1344 if ((i + currExSize) < instExSize)
1345 {
1346 newInst = builder.makeSplittingInst(inst, currExSize);
1347 newInst->setImplAccDst(builder.duplicateOperand(accDstRegion));
1348 newInst->setImplAccSrc(builder.duplicateOperand(accSrcRegion));
1349 newInst->setDest(newDst);
1350 newInst->setPredicate(builder.duplicateOperand(newPred));
1351 newInst->setCondMod(builder.duplicateOperand(newCond));
1352 newInst->setEvenlySplitInst(true);
1353 bb->insertBefore(iter, newInst);
1354 }
1355 else
1356 {
1357 // reuse the original inst
1358 newInst = inst;
1359 newInst->setExecSize(currExSize);
1360 newInst->setDest(newDst);
1361 if (newPred)
1362 {
1363 inst->setPredicate(builder.duplicateOperand(newPred));
1364 }
1365 if (newCond)
1366 {
1367 inst->setCondMod(builder.duplicateOperand(newCond));
1368 }
1369 if (accSrcRegion)
1370 {
1371 newInst->setImplAccSrc(builder.createSrcRegRegion(*accSrcRegion));
1372 }
1373 if (accDstRegion)
1374 {
1375 newInst->setImplAccDst(builder.createDstRegRegion(*accDstRegion));
1376 }
1377 }
1378
1379 for (int j = 0; j < numSrc; j++)
1380 {
1381 if (srcs[j])
1382 {
1383 if (srcs[j]->isImm() || srcs[j]->isNullReg())
1384 {
1385 newInst->setSrc(srcs[j], j);
1386 }
1387 else if (srcs[j]->isScalarSrc() || (j == 0 && op == G4_line))
1388 {
1389 // no need to split, but need to duplicate
1390 newInst->setSrc(builder.duplicateOperand(srcs[j]), j);
1391 }
1392 else
1393 {
1394 newInst->setSrc(builder.createSubSrcOperand(srcs[j]->asSrcRegRegion(), (uint16_t)i,
1395 currExSize, (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->vertStride),
1396 (uint8_t)(srcs[j]->asSrcRegRegion()->getRegion()->width)), j);
1397 }
1398 }
1399 }
1400
1401 // set mask
1402 bool needsMaskOffset = useARF || (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
1403 if (needsMaskOffset)
1404 {
1405 int newMaskOffset = origMaskOffset + (i == 0 ? 0 : currExSize);
1406 bool nibOk = builder.hasNibCtrl() &&
1407 (inst->getDst()->getTypeSize() == 8 || TypeSize(inst->getExecType()) == 8);
1408 G4_InstOption newMask = G4_INST::offsetToMask(currExSize, newMaskOffset, nibOk);
1409 if (newMask == InstOpt_NoOpt)
1410 {
1411 bool useMask = inst->getPredicate() || inst->getCondModBase() ||
1412 (!bb->isAllLaneActive() && !inst->isWriteEnableInst());
1413 MUST_BE_TRUE(!useMask, "no legal emask found for the split instruction");
1414 }
1415 else
1416 {
1417 newInst->setMaskOption(newMask);
1418 }
1419 }
1420
1421 // maintain def-use chain
1422 if (newInst == inst)
1423 {
1424 newInst->trimDefInstList();
1425 }
1426 else
1427 {
1428 inst->copyDefsTo(newInst, /*checked*/true);
1429 inst->copyUsesTo(newInst, /*checked*/true);
1430 }
1431 if (builder.getOption(vISA_OptReport))
1432 {
1433 newInst->emit(std::cout);
1434 std::cout << std::endl;
1435 }
1436 }
1437
1438 return extraMov;
1439 }
1440
1441 // this is specifically for math instruction
1442 // assumption: the input math function is a compressed instruction and need split
reduceExecSizeForMath(INST_LIST_ITER iter,G4_BB * bb)1443 bool HWConformity::reduceExecSizeForMath(INST_LIST_ITER iter, G4_BB* bb)
1444 {
1445 // split the instruction into two first
1446 evenlySplitInst(iter, bb);
1447 // fix execution size for each one
1448 INST_LIST_ITER firstIter = iter;
1449 firstIter--;
1450 reduceExecSize(firstIter, bb);
1451 return reduceExecSize(iter, bb);
1452 }
1453 // check overlap between src and dst
1454 // if overlap exists, insert to MOV to eliminate it
1455 // how about replicate regions?<0;4,1>
checkSrcDstOverlap(INST_LIST_ITER iter,G4_BB * bb,bool compOpt)1456 bool HWConformity::checkSrcDstOverlap(INST_LIST_ITER iter, G4_BB* bb, bool compOpt)
1457 {
1458 G4_INST *inst = *iter;
1459 G4_Operand *srcs[3];
1460 bool hasOverlap = false;
1461
1462 for (int i = 0; i < inst->getNumSrc(); i++)
1463 {
1464 srcs[i] = inst->getSrc(i);
1465 }
1466 // check dst/src dependency
1467 // how about replicate regions?<0;4,1>
1468 if (inst->getDst() && !inst->hasNULLDst())
1469 {
1470 for (int i = 0; i < inst->getNumSrc(); i++)
1471 {
1472 bool useTmp = false;
1473 if (srcs[i] && (IS_VINTTYPE(srcs[i]->getType()) || IS_VFTYPE(srcs[i]->getType())))
1474 {
1475 useTmp = true;
1476 }
1477 else
1478 {
1479 G4_CmpRelation rel = inst->getDst()->compareOperand(srcs[i]);
1480 if (rel != Rel_disjoint)
1481 {
1482 useTmp = (rel != Rel_eq) || compOpt ||
1483 srcs[i]->asSrcRegRegion()->getRegion()
1484 ->isRepeatRegion(inst->getExecSize());
1485 }
1486 }
1487 if (useTmp)
1488 {
1489 // insert mov
1490 inst->setSrc(insertMovBefore(iter, i, G4_Operand::GetNonVectorImmType(srcs[i]->getType()), bb), i);
1491 srcs[i] = inst->getSrc(i);
1492 // reducing exec size for new MOV
1493 INST_LIST_ITER newMovIter = iter;
1494 newMovIter--;
1495 reduceExecSize(newMovIter, bb);
1496 hasOverlap = true;
1497
1498 }
1499 }
1500 }
1501
1502 return hasOverlap;
1503 }
1504
1505 // move source operand to one or two GRF
1506 // tmp dst use the same type as source.
1507 // this MOV does not need further resucing execsize
moveSrcToGRF(INST_LIST_ITER it,uint32_t srcNum,uint16_t numGRF,G4_BB * bb)1508 void HWConformity::moveSrcToGRF(INST_LIST_ITER it, uint32_t srcNum, uint16_t numGRF, G4_BB *bb)
1509 {
1510 G4_INST* inst = *it;
1511 G4_ExecSize execSize = inst->getExecSize();
1512
1513 G4_Operand *src = inst->getSrc(srcNum);
1514 uint32_t srcTypeSize = src->getTypeSize();
1515 uint16_t dclSize = (numEltPerGRF<Type_UB>() * numGRF) / srcTypeSize;
1516 uint16_t hs = dclSize / execSize;
1517 uint16_t wd = execSize;
1518 uint16_t vs = hs * wd;
1519 const RegionDesc* region = builder.createRegionDesc(vs, wd, hs);
1520
1521 // look up in MOV table to see if there is already inserted MOV for this source.
1522 G4_INST* def_inst = NULL;
1523 def_inst = checkSrcDefInst(inst, def_inst, srcNum);
1524
1525 G4_Type tmpType = G4_Operand::GetNonVectorImmType(src->getType());
1526
1527 if (def_inst && def_inst->getDst()->getType() == tmpType &&
1528 (def_inst->getExecSize() == execSize) &&
1529 def_inst->getDst()->coverGRF(numGRF, execSize) &&
1530 def_inst->getDst()->checkGRFAlign() &&
1531 (bb->isAllLaneActive() || def_inst->isWriteEnableInst()))
1532 {
1533
1534 //inst->removeDefUse(Gen4_Operand_Number(srcNum + 1));
1535 //def_inst->useInstList.emplace_back(inst, Gen4_Operand_Number(srcNum + 1));
1536 //inst->defInstList.emplace_back(def_inst, Gen4_Operand_Number(srcNum + 1));
1537
1538 G4_DstRegRegion* existing_def = def_inst->getDst();
1539 G4_SrcRegRegion* newSrc = builder.createSrc(
1540 existing_def->getBase(),
1541 existing_def->getRegOff(),
1542 existing_def->getSubRegOff(),
1543 region,
1544 src->getType());
1545 inst->setSrc(newSrc, srcNum);
1546 }
1547
1548 G4_Declare* dcl = builder.createTempVar(dclSize, src->getType(), GRFALIGN);
1549 G4_DstRegRegion *dstRegion = builder.createDst(
1550 dcl->getRegVar(),
1551 0,
1552 0,
1553 hs,
1554 dcl->getElemType());
1555 G4_INST* newInst = builder.createMov(
1556 execSize, dstRegion, src, (!bb->isAllLaneActive() ? InstOpt_WriteEnable : InstOpt_NoOpt), false);
1557
1558 // insert instruction and maintain def-use chain
1559 bb->insertBefore(it, newInst);
1560 inst->transferDef(newInst, Gen4_Operand_Number(srcNum + 1), Opnd_src0);
1561 newInst->addDefUse(inst, Gen4_Operand_Number(srcNum + 1));
1562
1563 G4_SrcRegRegion* newSrc = builder.createSrc(
1564 dcl->getRegVar(),
1565 0,
1566 0,
1567 region,
1568 dcl->getElemType());
1569 inst->setSrc(newSrc, srcNum);
1570 }
1571
1572 /*
1573 * create a new mov instruction and insert it before iter
1574 * mov (esize) tmpDst dst (nomask)
1575 * add (esize) tmpDst ...
1576 * where esize is "inst"'s execution size
1577 *
1578 */
saveDst(INST_LIST_ITER & it,uint8_t stride,G4_BB * bb)1579 void HWConformity::saveDst(INST_LIST_ITER& it, uint8_t stride, G4_BB *bb)
1580 {
1581 G4_INST* inst = *it;
1582 G4_DstRegRegion *dst = inst->getDst();
1583 G4_ExecSize execSize = inst->getExecSize();
1584 G4_Type dstType = dst->getType();
1585 uint16_t dstWidthBytes = execSize * TypeSize(dstType) * stride;
1586
1587 G4_SubReg_Align subAlign = getDclAlignment(dstWidthBytes, inst, execSize == 1);
1588
1589 uint32_t numElt = execSize == 1 ? 1 : execSize * stride;
1590 G4_Declare* dcl = builder.createTempVar(numElt, dstType, subAlign);
1591
1592 uint16_t hs = dst->getHorzStride();
1593 const RegionDesc *region = builder.createRegionDesc(hs * execSize, execSize, hs);
1594 G4_SrcRegRegion *srcRegion = builder.createSrc(dst->getBase(), dst->getRegOff(),
1595 dst->getSubRegOff(), region, dstType);
1596
1597 G4_DstRegRegion *tmpDstOpnd = builder.createDstRegRegion(dcl, stride);
1598
1599 unsigned int new_option = inst->getOption();
1600
1601 G4_INST* newInst = builder.createMov(execSize, tmpDstOpnd, srcRegion, new_option, false);
1602 newInst->setNoMask(true);
1603
1604 bb->insertBefore(it, newInst);
1605 inst->setDest(builder.duplicateOperand(tmpDstOpnd));
1606 }
1607
restoreDst(INST_LIST_ITER & it,G4_DstRegRegion * origDst,G4_BB * bb)1608 void HWConformity::restoreDst(INST_LIST_ITER& it, G4_DstRegRegion *origDst, G4_BB *bb)
1609 {
1610 G4_INST* inst = *it;
1611 G4_DstRegRegion *dst = inst->getDst();
1612 G4_ExecSize execSize = inst->getExecSize();
1613
1614 uint16_t hs = dst->getHorzStride();
1615 const RegionDesc *region = builder.createRegionDesc(hs * execSize, execSize, hs);
1616 G4_SrcRegRegion *srcRegion = builder.createSrc(dst->getBase(), dst->getRegOff(),
1617 dst->getSubRegOff(), region, dst->getType());
1618
1619 unsigned int new_option = inst->getOption();
1620
1621 G4_INST* newInst = builder.createMov(execSize, origDst, srcRegion, new_option, false);
1622 newInst->setNoMask(true);
1623
1624 INST_LIST_ITER iter = it;
1625 iter++;
1626 bb->insertBefore(iter, newInst);
1627
1628 // how about def-use?
1629 inst->transferUse(newInst);
1630 inst->addDefUse(newInst, Gen4_Operand_Number::Opnd_src0);
1631 }
1632
1633 /*
1634 * create a new mov instruction and insert it after iter
1635 * mov (esize) dst tmp:dst_type
1636 * where esize is "inst"'s execution size and insert it after "inst"
1637 * dst of inst is replaced with the tmp dst using the same type
1638 */
insertMovAfter(INST_LIST_ITER & it,uint16_t stride,G4_BB * bb)1639 void HWConformity::insertMovAfter(INST_LIST_ITER& it, uint16_t stride, G4_BB* bb)
1640 {
1641 G4_INST* inst = *it;
1642 G4_DstRegRegion *dst = inst->getDst();
1643 G4_ExecSize execSize = inst->getExecSize();
1644 G4_Type execType = inst->getExecType(), dstType = dst->getType();
1645 uint16_t opExecWidthBytes = execSize * TypeSize(execType);
1646 uint16_t dstWidthBytes = execSize * TypeSize(dstType) * stride;
1647
1648 G4_SubReg_Align subAlign = getDclAlignment(opExecWidthBytes > dstWidthBytes ? opExecWidthBytes : dstWidthBytes,
1649 inst, execSize == 1);
1650
1651 G4_Declare* dcl = builder.createTempVar(execSize * stride, dstType, subAlign);
1652
1653 const RegionDesc* region = builder.createRegionDesc(stride, 1, 0);
1654 G4_SrcRegRegion *srcRegion = builder.createSrcRegRegion(dcl, region);
1655 G4_DstRegRegion *tmpDstOpnd = builder.createDstRegRegion(dcl, stride);
1656
1657 G4_Predicate *pred = NULL;
1658 if (inst->opcode() != G4_sel) {
1659 pred = inst->getPredicate();
1660 inst->setPredicate(NULL);
1661 }
1662 unsigned int new_option = inst->getOption();
1663
1664 G4_INST* newInst = builder.createMov(execSize, dst, srcRegion, new_option, false);
1665 newInst->setPredicate(pred);
1666
1667 INST_LIST_ITER iter = it;
1668 iter++;
1669 bb->insertBefore(iter, newInst);
1670 // change dst of inst
1671 inst->setDest(tmpDstOpnd);
1672
1673 // update propagation info
1674 if (pred)
1675 {
1676 inst->transferDef(newInst, Opnd_pred, Opnd_pred);
1677 }
1678
1679 inst->transferUse(newInst);
1680 inst->addDefUse(newInst, Opnd_src0);
1681 }
1682
1683
removeBadSrc(INST_LIST_ITER & iter,G4_BB * bb,bool crossGRFDst,bool oneGRFSrc[3],bool badTwoGRFSrc[3])1684 void HWConformity::removeBadSrc(INST_LIST_ITER& iter, G4_BB *bb, bool crossGRFDst, bool oneGRFSrc[3], bool badTwoGRFSrc[3])
1685 {
1686 G4_INST *inst = *iter;
1687 G4_Operand *dst = inst->getDst();
1688 // check source and dst region together
1689 // get rid of bad two-GRF source
1690 for (int i = 0; i < inst->getNumSrc(); i++)
1691 {
1692
1693 if (badTwoGRFSrc[i])
1694 {
1695 if (!crossGRFDst ||
1696 (dst && IS_DTYPE(dst->getType()) && IS_WTYPE(inst->getSrc(i)->getType())))
1697 {
1698 inst->setSrc(insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb), i);
1699 }
1700 else
1701 {
1702 moveSrcToGRF(iter, i, 2, bb);
1703 }
1704 badTwoGRFSrc[i] = false;
1705 INST_LIST_ITER tmpIter = iter;
1706 tmpIter--;
1707 reduceExecSize(tmpIter, bb);
1708 if (builder.getOption(vISA_OptReport))
1709 {
1710 (*tmpIter)->emit(std::cout);
1711 std::cout << std::endl;
1712 }
1713 }
1714 }
1715 }
1716