1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include <cmath>
10
11 #include "HWConformity.h"
12 #include "Optimizer.h"
13 #include "visa_wa.h"
14 #include "DebugInfo.h"
15 #include "G4_Verifier.hpp"
16
17 using namespace vISA;
18
getReverseCondMod(G4_CondModifier mod)19 static G4_CondModifier getReverseCondMod(G4_CondModifier mod)
20 {
21 switch (mod)
22 {
23 case Mod_z:
24 return Mod_z;
25 case Mod_e:
26 return Mod_e;
27 case Mod_nz:
28 return Mod_nz;
29 case Mod_ne:
30 return Mod_ne;
31 case Mod_g:
32 return Mod_l;
33 case Mod_ge:
34 return Mod_le;
35 case Mod_l:
36 return Mod_g;
37 case Mod_le:
38 return Mod_ge;
39 default:
40 MUST_BE_TRUE(0, "Invalid conditional modifier input for reversed conditional modifier.");
41 }
42 return Mod_cond_undef;
43 }
44
isCompressedInst(G4_INST * inst)45 static bool isCompressedInst(G4_INST* inst) {
46 return inst->isComprInst();
47 }
48
// True if "opnd" supplies the same value layout for a whole execSize-wide row:
// an immediate, a region whose width equals the execution size, or a region
// with zero vertical stride.
// Fixed: macro arguments and the &&/|| sub-expressions are now fully
// parenthesized. The original relied on && binding tighter than || and expanded
// its arguments unparenthesized, which is fragile under macro expansion and
// triggers -Wlogical-op-parentheses; semantics are unchanged.
#define isUnitRegionRow(opnd, execSize) \
    ((opnd)->isImm() || \
     ((opnd)->isSrcRegRegion() && (opnd)->asSrcRegRegion()->getRegion()->width == (execSize)) || \
     ((opnd)->isSrcRegRegion() && (opnd)->asSrcRegRegion()->getRegion()->vertStride == 0))
53
getDclAlignment(int opndBytes,G4_INST * inst,bool isScalar)54 G4_SubReg_Align HWConformity::getDclAlignment(int opndBytes, G4_INST* inst, bool isScalar)
55 {
56 auto subAlign = Get_G4_SubRegAlign_From_Size((uint16_t)opndBytes);
57 bool hasAccSrc = inst->hasACCSrc();
58
59 if (hasAccSrc && subAlign < GRFALIGN)
60 {
61 subAlign = GRFALIGN;
62 }
63
64 if (!isScalar)
65 {
66 // certain instructions have additional alignment requirements for non-scalar sources
67 if (!builder.hasAlign1Ternary() && inst->getNumSrc() == 3 && !inst->isSend() && subAlign < Eight_Word)
68 {
69 subAlign = Eight_Word;
70 }
71 if (inst->isMath())
72 {
73 subAlign = GRFALIGN;
74 }
75 }
76
77 return subAlign;
78 }
79 /*
80 * create a new mov instruction and insert it after "it"
81 * mov (esize) dst tmp:type
82 * where esize is "inst"'s execution size and insert it after "inst"
83 * return value is the new temp variable as a dst operand
84 * If dstAlign is specified, the new temp will at least be aligend to that size
85 *
86 * The new mov instruction is inserted right after "it", and caller is safe to
87 * access it via "++it".
88 */
G4_DstRegRegion* HWConformity::insertMovAfter(INST_LIST_ITER& it, G4_DstRegRegion* dst, G4_Type type, G4_BB* bb, G4_SubReg_Align dstAlign)
{
    G4_INST* inst = *it;

    if (!dst)
    {
        // nothing to redirect
        return dst;
    }

    if (inst->hasNULLDst())
    {
        // null dst: no copy needed, just return the null dst retyped
        return builder.createDst(
            dst->getBase(),
            0,
            0,
            1,
            type);
    }

    G4_ExecSize exec_size = inst->getExecSize();
    // raw movs keep the dst type as the execution type; otherwise use the
    // instruction's computed execution type
    G4_Type execType = inst->isRawMov() ? dst->getType() : inst->getExecType();
    bool scalarSrc = true;

    // determine whether all sources are scalar/immediate; if so the defining
    // instruction can be shrunk to SIMD1 (vector immediates excluded)
    for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++)
    {
        G4_Operand* src = inst->getSrc(i);
        if (!src->isImm())
        {
            // math's null src1 does not count as a vector source
            if (!(inst->isMath() && i == 1 && src->isNullReg()) &&
                (src->isSrcRegRegion() && !src->asSrcRegRegion()->isScalar()))
            {
                scalarSrc = false;
            }
        }
        else if (IS_VINTTYPE(src->getType()) || IS_VFTYPE(src->getType()))
        {
            // packed vector immediates are not scalar
            scalarSrc = false;
        }
    }

    // sel needs its predicate per-lane, and acc sources can't be SIMD1-ized
    G4_ExecSize newExecSize =
        ((inst->opcode() == G4_sel || inst->getImplAccSrc() || !scalarSrc) ? exec_size : g4::SIMD1);

    uint32_t opExecWidthBytes = newExecSize * TypeSize(execType);
    if (execType == Type_DF && IS_BTYPE(type))
    {
        // DF exec type can't pair with byte dst; widen the temp to word
        type = (type == Type_UB ? Type_UW : Type_W);
    }
    uint16_t dstWidthBytes = newExecSize * TypeSize(type);
    // stride of the temp dst so that the temp's elements line up with the
    // execution type's element size
    uint16_t scale = TypeSize(execType) / TypeSize(type);
    /* so according to comments in the functions that call it, MAD needs to have packed format.
       It ends up with hStride 2, due to DefHoisting.
       So it is trying to undo it.
       For every other type if srcType > dstType we need to adjust regions.
       This is not necessary for HF. It's already packed.

       The src region of the move is wrong. Since for HF it is packed, unlike other data types.
       mad (8) r56.0.xyzw:hf -r37.0.xyzw:f r59.0.xyzw:hf r58.0.xyzw:hf {Align16, NoMask}
       mov (16) r44.0<2>:hf r56.0<16;8,2>:hf {Align1, H1} // #??:$39:%66
    */
    if (scale == 0 || (builder.getPlatform() >= GENX_CHV && execType == Type_F && type == builder.getMixModeType()))
    {
        scale = 1;
    }

    G4_SubReg_Align subAlign = getDclAlignment(opExecWidthBytes > dstWidthBytes ? opExecWidthBytes : dstWidthBytes,
        inst, newExecSize == 1);

    if (subAlign < dstAlign)
    {
        subAlign = dstAlign;
    }

    const RegionDesc* region = newExecSize > 1 ? builder.createRegionDesc(scale, 1, 0) : builder.getRegionScalar();

    G4_Declare* dcl = builder.createTempVar(newExecSize == 1 ? 1 : newExecSize * scale, type, subAlign);

    G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(dcl, region);
    G4_Predicate* pred = NULL;

    if (inst->opcode() != G4_sel)
    {
        // the predicate moves to the new mov; sel keeps it on the original
        pred = inst->getPredicate();
        inst->setPredicate(NULL);
        // maintainDU4TempMov will update def-use
    }

    unsigned int new_option = inst->getMaskOption();
    G4_INST* newInst = builder.createMov(exec_size, dst, srcRegion, new_option, false);
    newInst->setPredicate(pred);
    newInst->setSaturate(inst->getSaturate());
    bb->insertAfter(it, newInst);

    // update propagation info
    maintainDU4TempMov(inst, newInst);

    if (type == dst->getType())
    {
        // same-type copy: saturation already applied by the original inst
        newInst->setSaturate(g4::NOSAT);
    }
    else if (type == Type_F || type == Type_DF)
    {
        // float temp: defer saturation to the final converting mov
        inst->setSaturate(g4::NOSAT);
    }

    inst->setExecSize(newExecSize);
    if (newExecSize == 1)
    {
        inst->setNoMask(true);
    }

    return builder.createDstRegRegion(dcl, scale);
}
202
203 //
204 // replace instruction (*it)' source srcPos, which must be a scalar/immediate,
205 // with a temp variable after inserting
206 // mov (esize) tmp<1>:type imm/scalar {options}
207 // before the instruction
208 // This is like insertMovBefore(), except that the latter will always use
209 // simd1 move for scalar/imm values, which may not be what we want
210 // NOTE: This does not check for redundant moves. We are counting on a later LVN pass
211 // to clean them up
212 //
broadcast(G4_BB * bb,INST_LIST_ITER it,int srcPos,G4_SubReg_Align align)213 void HWConformity::broadcast(
214 G4_BB* bb, INST_LIST_ITER it, int srcPos, G4_SubReg_Align align)
215 {
216 G4_INST* inst = *it;
217 G4_Operand* src = inst->getSrc(srcPos);
218 MUST_BE_TRUE(src->isImm() ||
219 (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar()),
220 "source must be an immediate or scalar");
221 G4_Type type = src->getType();
222
223 G4_ExecSize execSize = inst->getExecSize();
224 uint32_t instMask = inst->getMaskOption();
225
226 // avoid simd16 Qword moves
227 MUST_BE_TRUE((unsigned)execSize * TypeSize(type) <= 2u * numEltPerGRF<Type_UB>(),
228 "move can't exceed 2 GRFs");
229
230 G4_Declare* dcl = builder.createTempVar(execSize, type, align);
231 G4_DstRegRegion* dst = builder.createDst(
232 dcl->getRegVar(),
233 0,
234 0,
235 1,
236 type);
237 G4_INST* newInst = builder.createMov(execSize, dst, src, instMask, false);
238
239 bb->insertBefore(it, newInst);
240
241 const RegionDesc* srcRegion = builder.getRegionStride1();
242 G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(dcl, srcRegion);
243 inst->setSrc(newSrc, srcPos);
244 newInst->addDefUse(inst, inst->getSrcOperandNum(srcPos));
245
246 }
247
248 //
249 // A simplified version of insertMovBefore(), this copies raw bytes from source to a temp
250 // and replaces the original source with tmp. This is primarily used to ensure operand alignment and region restrictions
251 // op (esize) ... (mod) src<region>:type
252 // -->
253 // mov (esize) tmp<1>:type src<region>:type
254 // op (esize) ... (mod) tmp<1;1,0>:type
255 //
256 // source must be a G4_SrcRegRegion (direct or indirect), immediates are not supported
257 // note that modifier is propagated from source to tmp, but region is not
258 //
259 //
insertCopyBefore(INST_LIST_ITER it,uint32_t srcNum,G4_SubReg_Align tmpAlign,G4_BB * bb)260 G4_SrcRegRegion* HWConformity::insertCopyBefore(INST_LIST_ITER it, uint32_t srcNum,
261 G4_SubReg_Align tmpAlign, G4_BB* bb)
262 {
263 G4_INST* inst = *it;
264 G4_Operand* src = inst->getSrc(srcNum);
265 MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
266 G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
267
268 G4_ExecSize newExecSize = origSrc->isScalar() ? g4::SIMD1 : inst->getExecSize();
269 G4_Declare* dcl = builder.createTempVar(newExecSize, origSrc->getType(), tmpAlign);
270 G4_SrcModifier modifier = origSrc->getModifier();
271 origSrc->setModifier(Mod_src_undef);
272 G4_DstRegRegion* dst = builder.createDstRegRegion(dcl, 1);
273
274 G4_INST* movInst = builder.createMov(newExecSize, dst, origSrc, InstOpt_WriteEnable, false);
275
276 bb->insertBefore(it, movInst);
277 G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(modifier, Direct, dcl->getRegVar(),
278 0, 0, newExecSize == 1 ? builder.getRegionScalar() : builder.getRegionStride1(),
279 dcl->getElemType());
280
281 return newSrc;
282 }
283
// Copy src into a GRF-aligned temp that is pinned to src's current physical
// register(s), inserting the copy at the entry of bb (after any labels).
// Returns the temp as a new source operand, or nullptr when src's linearized
// footprint is unusable.
G4_SrcRegRegion* HWConformity::insertCopyAtBBEntry(G4_BB* bb, G4_ExecSize execSize, G4_Operand* src)
{
    MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
    G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
    auto lb = src->getLinearizedStart();
    auto rb = src->getLinearizedEnd();

    unsigned int regNum = lb / numEltPerGRF<Type_UB>();
    unsigned int numRegs = (rb + numEltPerGRF<Type_UB>() - 1 - lb) / numEltPerGRF<Type_UB>();
    // NOTE(review): regNum is unsigned, so "regNum == -1" only matches UINT_MAX;
    // presumably this guards an undefined linearized start — confirm the intent.
    if (regNum == -1 || numRegs == 0)
    {
        return nullptr;
    }

    // pin the temp to src's current physical GRF so the copy is in place
    G4_Declare* dcl = builder.createTempVar(execSize, origSrc->getType(), GRFALIGN);
    dcl->getRegVar()->setPhyReg(builder.phyregpool.getGreg(regNum), 0);
    // the modifier is carried over to the returned operand, not the raw copy
    G4_SrcModifier modifier = origSrc->getModifier();
    origSrc->setModifier(Mod_src_undef);
    G4_DstRegRegion* dst = builder.createDstRegRegion(dcl, 1);
    dst->computePReg();

    G4_INST* movInst = builder.createMov(execSize, dst, origSrc, InstOpt_WriteEnable, false);

    // insert right after the BB's label(s), i.e. at the first real instruction
    for (auto it = bb->begin();
        it != bb->end();
        it++)
    {
        if (!(*it)->isLabel())
        {
            bb->insertBefore(it, movInst);
            break;
        }
    }

    G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(modifier, Direct, dcl->getRegVar(),
        0, 0, execSize == 1 ? builder.getRegionScalar() : builder.getRegionStride1(),
        dcl->getElemType());
    newSrc->asSrcRegRegion()->computePReg();
    return newSrc;
}
324
325 /*
326 * create a new mov instruction
327 * mov (esize) tmp<1>:type src
328 * where esize is "inst"'s execution size and insert it before "inst"
329 * return value is the new temp variable as a source operand.
330 *
331 * "inst" is pointed by "it", and the new mov inst is inserted right
332 * before "it", so that caller can safely use "--it" to access the new
333 * mov instruction.
334 */
// Convenience overload: no explicit temp stride (0 lets the main overload
// derive the stride from the source/destination type sizes).
G4_Operand* HWConformity::insertMovBefore(
    INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb,
    G4_SubReg_Align tmpAlign)
{
    return insertMovBefore(it, srcNum, type, bb, 0, tmpAlign);
}
341
// Create "mov (esize) tmp<stride>:type src" before the instruction at "it" and
// return tmp as a replacement for source srcNum. Scalar/immediate sources are
// copied with a SIMD1 NoMask mov. tmpStride, if non-zero, forces the temp's
// horizontal stride; otherwise it is derived from the type-size ratio.
G4_Operand* HWConformity::insertMovBefore(INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb,
    uint16_t tmpStride, G4_SubReg_Align tmpAlign)
{
    G4_INST* inst = *it;
    G4_SubReg_Align subAlign;
    const RegionDesc* region = nullptr;
    G4_ExecSize execSize = inst->getExecSize();
    G4_Operand* src = inst->getSrc(srcNum);
    // same-type byte copies get stride 2 (byte dst stride restriction)
    unsigned short scale = IS_BTYPE(src->getType()) && src->getType() == type ? 2 : 1;

    // non-vector immediates and scalar regions only need a SIMD1 copy
    G4_ExecSize newExecSize = (src->isImm() && !IS_VTYPE(src->getType())) ||
        (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar())
        ? g4::SIMD1 : execSize;

    if (newExecSize > 1)
    {
        if (tmpStride)
        {
            // caller-specified stride wins
            scale = tmpStride;
        }
        else
        {
            // otherwise stride = srcTypeSize / dstTypeSize (packed if <= 1)
            if (scale == 1 && !IS_VTYPE(src->getType()))
            {
                scale = (uint16_t)(TypeSize(src->getType()) / TypeSize(type));
            }
            if (scale == 0)
            {
                scale = 1;
            }
        }
        region = builder.createRegionDesc(scale, 1, 0);
    }
    else
    {
        scale = src->getTypeSize() / TypeSize(type);
        if (scale == 0)
        {
            scale = 1;
        }
        region = builder.getRegionScalar();
    }

    // size the temp in bytes; :v/:uv expand to 8 words and :vf to 4 floats per
    // 8/4 lanes respectively
    int opExecWidthBytes = IS_VINTTYPE(src->getType()) ?
        numEltPerGRF<Type_UB>() / 2 * (execSize > 8 ? execSize / 8 : 1) :
        (src->getType() == Type_VF ?
            numEltPerGRF<Type_UB>() / 2 * (execSize > 4 ? execSize / 4 : 1) :
            newExecSize * TypeSize(type) * scale);

    subAlign = getDclAlignment(opExecWidthBytes, inst, newExecSize == 1);

    if (subAlign < tmpAlign)
    {
        subAlign = tmpAlign;
    }

    uint32_t newInstEMask = newExecSize == 1 ? InstOpt_WriteEnable : inst->getMaskOption();

    // due to old BDW regioning rule we need NoMask inst here so they can be split
    if (kernel.getKernelType() == VISA_CM && builder.getPlatform() == GENX_BDW)
    {
        if (!bb->isAllLaneActive())
        {
            newInstEMask = InstOpt_WriteEnable;
        }
    }

    G4_Declare* dcl = builder.createTempVar(newExecSize == 1 ? 1 : newExecSize * scale, type, subAlign);
    G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, scale);
    G4_INST* newInst = builder.createMov(newExecSize, dstRegion, builder.duplicateOperand(src), newInstEMask, false);
    bb->insertBefore(it, newInst);
    // the original def of src now feeds the new mov; the mov defines srcNum
    inst->transferDef(newInst, Gen4_Operand_Number(srcNum + 1), Opnd_src0);
    newInst->addDefUse(inst, Gen4_Operand_Number(srcNum + 1));

    G4_SrcModifier modifier = Mod_src_undef;
    if (src->isSrcRegRegion())
    {
        G4_SrcModifier srcMod = src->asSrcRegRegion()->getModifier();
        if (srcMod == Mod_Not)
        {
            // mov doesn't support logic modifiers, so we keep it on the new source
            modifier = Mod_Not;
            newInst->getSrc(0)->asSrcRegRegion()->setModifier(Mod_src_undef);
        }
        else if (src->getType() == Type_BF)
        {
            // bf mov does not support src mod as it is changed to shl or uw mov.
            // Keep it on the new source.
            modifier = srcMod;
            newInst->getSrc(0)->asSrcRegRegion()->setModifier(Mod_src_undef);
        }
    }

    return builder.createSrcRegRegion(
        modifier,
        Direct,
        dcl->getRegVar(),
        0,
        0,
        region,
        dcl->getElemType());
}
444
// Legalize packed-vector immediate sources (:v/:uv/:vf). HW requires that a :v
// source co-exist only with word-typed sources and a :vf source only with
// float sources; offending (or non-last) packed immediates are loaded into a
// temp via a mov.
void HWConformity::fixPackedSource(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* inst = *it;

    // flags accumulate over the sources seen so far (earlier operands only);
    // once an incompatibility is found it sticks for the remaining sources
    bool nonTypeWFound = false, nonTypeFFound = false, incompatibleTypeFound = false;

    for (int i = 0; i < inst->getNumSrc(); i++)
    {
        auto src = inst->getSrc(i);
        if (!src)
        {
            continue;
        }
        if (!IS_VTYPE(src->getType()))
        {
            // Make sure other src operands are of word type only as this is a HW requirement
            if (src->getType() != Type_W && src->getType() != Type_UW)
            {
                nonTypeWFound = true;
            }
            if (src->getType() != Type_F)
            {
                nonTypeFFound = true;
            }
            continue;
        }
        // packed immediate: :vf pairs with float, :v/:uv with word
        G4_Type target_type = src->getType() == Type_VF ? Type_F : Type_W;
        if (target_type == Type_W && (nonTypeWFound || !builder.hasByteALU()))
        {
            // non-word type src is not allowed to co-exist with :v src
            // also if platform lacks byte regioning :v src may be incompatible with later legalization
            incompatibleTypeFound = true;
        }
        else if (target_type == Type_F && nonTypeFFound == true)
        {
            // non-float type src is not allowed to co-exist with :vf src
            incompatibleTypeFound = true;
        }

        // Insert a move only if immediate operand is not last src operand
        if (i != inst->getNumSrc() - 1 || incompatibleTypeFound == true)
        {
            inst->setSrc(insertMovBefore(it, i, target_type, bb), i);
        }
    }
}
491 /*
492 * fixMathInst() checks the following:
493 * The math instruction can only use GRF registers as source(s) and destination.
494 * The math instruction does not support indirect addressing modes.
495 * source horizontal stride must be 1 with the exception of scalar sources and destination horizontal stride must be always 1.
496 * Source and destination offset must be the same, except the case of scalar source
497 * DW and UD is the only source format supported for INT DIV, FP16/FP32 is the only source format supported for all the other functions.
498 * Mixed DW and UD sources are not allowed for the INT DIV function.
499 * For single source math function, <src1> must be programmed as ARF-NULL register.
500 */
fixMathInst(INST_LIST_ITER it,G4_BB * bb)501 bool HWConformity::fixMathInst(INST_LIST_ITER it, G4_BB* bb)
502 {
503 G4_INST* inst = *it;
504 G4_DstRegRegion* dst = inst->getDst();
505 G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);
506 bool mov_dst = false;
507
508 MUST_BE_TRUE(inst->isMath(), "Expect math instruction");
509 G4_InstMath* mathInst = inst->asMathInst();
510
511 if (mathInst->getMathCtrl() == MATH_INVM || mathInst->getMathCtrl() == MATH_RSQRTM)
512 {
513 // split two GRF math macros. This should only happen for FP64
514 if (!builder.hasTwoGRFMathMacro() &&
515 IS_DFTYPE(inst->getDst()->getType()) && ((uint32_t)(inst->getExecSize() * 2)) > builder.getNativeExecSize())
516 {
517 evenlySplitInst(it, bb);
518 return true;
519 }
520 // math macros are constructed internally and should already conform to all other HW rules
521 return false;
522 }
523
524 if (builder.getOption(vISA_DisableHFMath))
525 {
526 auto src0 = inst->getSrc(0);
527 auto src1 = inst->getSrc(1);
528 auto dst = inst->getDst();
529 if (src0 && src0->getType() == Type_HF)
530 {
531 replaceSrc(it, 0, Type_F, bb);
532 }
533
534 if (src1 && src1->getType() == Type_HF)
535 {
536 replaceSrc(it, 1, Type_F, bb);
537 }
538
539 if (dst && dst->getType() == Type_HF)
540 {
541 replaceDst(it, Type_F);
542 }
543 }
544
545 // covers MATH_INT_DIV, MATH_INT_DIV_QUOT, MATH_INT_DIV_REM
546 bool isIntDivide = inst->asMathInst()->isMathIntDiv();
547 bool hasSameOffset = hasSameSubregOffset(inst);
548
549 auto hasModMinus = [](G4_Operand* SrcOprd)
550 {
551 if (SrcOprd->isSrcRegRegion())
552 {
553 G4_SrcModifier mod = SrcOprd->asSrcRegRegion()->getModifier();
554 return (mod == Mod_Minus || mod == Mod_Minus_Abs);
555 }
556 return false;
557 };
558
559 // check if the source needs a move and if so the new move type
560 auto needsMove = [this, inst, isIntDivide, hasSameOffset, hasModMinus](int srcID, G4_Type& newType)
561 {
562 assert((srcID == 0 || srcID == 1) && "math can have at most two sources");
563 G4_Operand* src = inst->getSrc(srcID);
564 newType = src->getType();
565 if (isIntDivide)
566 {
567 // case 1: Perform a signed division if there's any minus src modifier.
568 // math.quot r10:w r20:ub -r30:ub
569 // Make sure newType is D, not UD. The correct code is:
570 // mov r22:d r20:ub
571 // mov r32:d -r30:ub
572 // math.quot r10:w r22:d r32:d
573 // case 2: Perform an appropriate type conversion based on the type ranks of both sources.
574 // math.quot r6:ud r3:b r4:ud
575 // Make sure it's still an unsigned division.
576 // mov r11:ud r3:b
577 // math.quot r6:ud r11:ud r4:ud
578 G4_Type src0Type = inst->getSrc(0)->getType();
579 G4_Type src1Type = inst->getSrc(1)->getType();
580 G4_Type divType = Type_UNDEF;
581 if (hasModMinus(inst->getSrc(0)) || hasModMinus(inst->getSrc(1)))
582 {
583 // If there's any minus source modifier, do a signed division.
584 divType = Type_D;
585 }
586 else if (TypeSize(src0Type) != TypeSize(src1Type))
587 {
588 // If src0 and src1 have different ranks, get the signedness of the
589 // division from the higher rank src.
590 G4_Type higherRankType = TypeSize(src0Type) > TypeSize(src1Type) ? src0Type : src1Type;
591 divType = IS_SIGNED_INT(higherRankType) ? Type_D : Type_UD;
592 }
593 else
594 {
595 // If both sources have the same rank, do a signed division only
596 // when both are signed. Otherwise, do an unsigned division.
597 divType = IS_SIGNED_INT(src0Type) && IS_SIGNED_INT(src1Type) ? Type_D : Type_UD;
598 }
599 assert(divType == Type_D || divType == Type_UD);
600 if (newType != divType)
601 {
602 newType = divType;
603 return true;
604 }
605 }
606 else if ((src->getType() != Type_F && src->getType() != Type_VF) &&
607 (builder.getPlatform() == GENX_BDW || src->getType() != Type_HF))
608 {
609 // CHV+ supports F/HF math, while BDW only supports F math
610 // mix mode math is handled in fixMixedHFInst()
611 newType = Type_F;
612 return true;
613 }
614
615 if (src->isImm())
616 {
617 if (srcID == 0 && inst->asMathInst()->getMathCtrl() >= MATH_FDIV)
618 {
619 return true;
620 }
621 }
622 else if (src->isSrcRegRegion())
623 {
624 G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
625 const RegionDesc* rd = srcRegion->getRegion();
626 if (srcRegion->getModifier() != Mod_src_undef && isIntDivide)
627 {
628 // no source modifer for int divide
629 return true;
630 }
631 else if (srcRegion->getRegAccess() != Direct)
632 {
633 return true;
634 }
635 else if (!srcRegion->isScalar())
636 {
637 if (!hasSameOffset && !builder.isOpndAligned(srcRegion, numEltPerGRF<Type_UB>()))
638 {
639 return true;
640 }
641 else if (!rd->isContiguous(inst->getExecSize()))
642 {
643 return true;
644 }
645 }
646 }
647 else
648 {
649 ASSERT_USER(false, "Unexpected math source!");
650 }
651 return false;
652 };
653
654 if (src0)
655 {
656 G4_Type src0_type = src0->getType();
657 bool needsSrc0Mov = needsMove(0, src0_type);
658 if (needsSrc0Mov)
659 {
660 inst->setSrc(insertMovBefore(it, 0, src0->isImm() ? G4_Operand::GetNonVectorImmType(src0_type) : src0_type, bb), 0);
661 src0 = inst->getSrc(0);
662 }
663 }
664
665 bool nullSrc1 = src1 && src1->isNullReg();
666 if (!nullSrc1 && src1)
667 {
668 G4_Type src1_type = src1->getType();
669 bool needsSrc1Move = needsMove(1, src1_type);
670
671 if (needsSrc1Move)
672 {
673 if (isIntDivide && src1->isImm() && !IS_VINTTYPE(src1->getType()))
674 {
675 // just change the immediate's type
676 uint32_t immVal = (uint32_t)src1->asImm()->getImm();
677 inst->setSrc(builder.createImm(immVal, src1_type), 1);
678 }
679 else
680 {
681 inst->setSrc(insertMovBefore(it, 1, src1->isImm() ? G4_Operand::GetNonVectorImmType(src1_type) : src1_type, bb), 1);
682 }
683 src1 = inst->getSrc(1);
684 }
685 }
686
687 if (nullSrc1 && src0 && src1->getType() != src0->getType())
688 {
689 G4_SrcRegRegion* src1_opnd = builder.createNullSrc(inst->getSrc(0)->getType());
690 inst->setSrc(src1_opnd, 1);
691 }
692
693 // recompute as src0 and src1 may have been modified
694 hasSameOffset = hasSameSubregOffset(inst);
695 G4_Type extype = inst->getExecType2();
696 bool cond1 = (dst->getType() != extype && !(dst->getType() == Type_UD && extype == Type_D));
697 if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 || cond1 ||
698 (!hasSameOffset && inst->getExecSize() != g4::SIMD1 && !builder.isOpndAligned(dst, numEltPerGRF<Type_UB>())))
699 {
700 mov_dst = true;
701 replaceDst(it, extype);
702 }
703
704 if (builder.hasHFMathGRFAlign())
705 {
706 auto src0 = inst->getSrc(0);
707 auto src1 = inst->getSrc(1);
708 auto dst = inst->getDst();
709
710 if (dst && !dst->isNullReg() && dst->getType() == Type_HF && dst->getHorzStride() == 1)
711 {
712 if (!builder.isOpndAligned(dst, numEltPerGRF<Type_UB>()))
713 {
714 mov_dst = true;
715 replaceDst(it, dst->getType(), GRFALIGN);
716 }
717 if (src0 && !src0->isNullReg() && src0->getType() == Type_HF)
718 {
719 if (!builder.isOpndAligned(src0, numEltPerGRF<Type_UB>()))
720 {
721 G4_Operand* newSrc0 = insertMovBefore(it, 0, src0->getType(), bb, GRFALIGN);
722 inst->setSrc(newSrc0, 0);
723 }
724 }
725
726 if (src1 && !src1->isNullReg() && src1->getType() == Type_HF)
727 {
728 if (!builder.isOpndAligned(src0, numEltPerGRF<Type_UB>()))
729 {
730 G4_Operand* newSrc0 = insertMovBefore(it, 1, src0->getType(), bb, GRFALIGN);
731 inst->setSrc(newSrc0, 1);
732 }
733 }
734 }
735 }
736
737 return mov_dst;
738 }
739
// Convenience overload for callers that only need the yes/no answer,
// not the common byte offset itself.
bool HWConformity::hasSameSubregOffset(G4_INST* inst) const
{
    uint32_t offset;
    return hasSameSubregOffset(inst, offset);
}
745
746 //
747 // returns true if all sources and dst in this inst have the same fixed subreg offset
748 // null src/dst, scalar sources and immediates are excluded from the check
749 // If true, return the common byte offset in byteOffset
750 //
hasSameSubregOffset(G4_INST * inst,uint32_t & byteOffset) const751 bool HWConformity::hasSameSubregOffset(G4_INST* inst, uint32_t& byteOffset) const
752 {
753 bool anyOffset = true; // true means offset is not fixed yet
754 byteOffset = 0;
755 if (inst->getDst())
756 {
757 G4_DstRegRegion* dst = inst->getDst();
758 if (dst->isNullReg())
759 {
760 // do nothing
761 }
762 else if (dst->hasFixedSubregOffset(byteOffset))
763 {
764 anyOffset = false;
765 }
766 else
767 {
768 return false;
769 }
770 }
771
772 for (int i = 0; i < inst->getNumSrc(); ++i)
773 {
774 G4_Operand* src = inst->getSrc(i);
775 if (src->isSrcRegRegion())
776 {
777 uint32_t srcOffset = 0;
778 G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
779 if (srcRegion->isNullReg() || srcRegion->isScalar())
780 {
781 continue;
782 }
783 else if (srcRegion->hasFixedSubregOffset(srcOffset))
784 {
785 if (anyOffset)
786 {
787 byteOffset = srcOffset;
788 anyOffset = false;
789 }
790 else if (srcOffset != byteOffset)
791 {
792 return false;
793 }
794 }
795 else
796 {
797 return false;
798 }
799 }
800 }
801
802 return true;
803 }
804
805 // Check the following rules
806 // -- src0 in 2 source instructions may not be immediate. We try to swap for src0 and src1 for
807 // commutative instructions in such cases
808 // -- ARF may not be in src1
// Enforce two operand-placement rules (see the comment above):
// src0 of a 2-src instruction may not be an immediate/addr-expr, and ARFs may
// not appear in src1. Commutative ops are fixed by swapping; sel by swapping
// plus predicate/condmod inversion; otherwise a mov is inserted. Also
// normalizes madw's immediate sources to D/UD.
void HWConformity::fixImmAndARFSrc(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* inst = *it;
    if (inst->mayExceedTwoGRF())
    {
        // sends etc. are handled elsewhere
        return;
    }

    G4_Operand* src0, * src1, * src2;
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    /* Check for usage of two constants in binary operations */
    if (src0 && (src0->isImm() || src0->isAddrExp()) && inst->getNumSrc() == 2)
    {
        if (INST_COMMUTATIVE(inst->opcode()) && !src1->isImm())
        {
            //all commutative inst must have 2 sources
            if (inst->opcode() == G4_mul)
            {
                bool needConstMov;
                //for DW and W mul, src0 must be DW and src1 W
                needConstMov = IS_DTYPE(src0->getType()) && !IS_DTYPE(src1->getType());

                if (needConstMov)
                {
                    // can't swap (would violate DW*W operand order); load src0 into a temp
                    G4_Type tmpType = G4_Operand::GetNonVectorImmType(src0->getType());

                    G4_Operand* newSrc0 = insertMovBefore(it, 0, tmpType, bb);
                    inst->setSrc(newSrc0, 0);
                }
                else
                {
                    // swap operands
                    inst->swapSrc(0, 1);
                    inst->swapDefUse();
                }
            }
            else
            {
                // swap operands
                inst->swapSrc(0, 1);
                inst->swapDefUse();
            }
        }
        /*
         * A select operation isn't commutative, but we may commute the
         * operands provided we perform a predicate inversion as well.
         * (v0) sel ... const V1
         * =>
         * (-v0) sel ... V1 const
         */
        else if (inst->opcode() == G4_sel && !src1->isImm())
        {
            G4_CondMod* cond = inst->getCondMod();
            if (cond)
            {
                // conditional sel: invert the (symmetric) e/ne condition
                switch (cond->getMod())
                {
                case Mod_ne:
                    inst->setCondMod(builder.createCondMod(Mod_e, cond->getBase(), 0));
                    break;
                case Mod_e:
                    inst->setCondMod(builder.createCondMod(Mod_ne, cond->getBase(), 0));
                    break;
                default:
                    break;
                }
            }
            else
            {
                // predicated sel: flip the predicate state
                G4_Predicate* pred = inst->getPredicate();
                MUST_BE_TRUE(pred != NULL, "predicate must not be null");
                G4_PredState reverse = pred->getState() == PredState_Minus ? PredState_Plus : PredState_Minus;
                inst->setPredicate(builder.createPredicate(
                    reverse, pred->getBase(), pred->getSubRegOff(), pred->getControl()));
            }
            inst->swapSrc(0, 1);
            inst->swapDefUse();
        }
        else if (!inst->isMath())
        {
            // math immediate src0 is handled separately in fixMathInst()
            // If src0 is not 64-bit, src1 is 64-bit, swap them to save one move.
            if (INST_COMMUTATIVE(inst->opcode()) && src0->isImm() && src1->isImm() &&
                src0->getTypeSize() != 8 && src1->getTypeSize() == 8)
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
                src0 = inst->getSrc(0);
                src1 = inst->getSrc(1);
                // this needs to fall through as we still need move for src0
            }

            if (INST_COMMUTATIVE(inst->opcode()) && src0->isAddrExp() && src1->isImm())
            {
                // The original IR has both addr expr and immediate
                // add(8) A0(0, 0)<1>:uw &V36 + 0 0xeca86420 : uv{ Align1, Q1 }
                // We insert a move for src1 which is an immediate
                // mov(8) TV0(0, 0)<1> : uw 0xeca86420 : uv{ Align1 }
                // add(8) A0(0, 0)<1> : uw &V36 + 0 TV0(0, 0)<8; 8, 1> : uw{ Align1, Q1 }
                G4_Type type = src1->getType();
                inst->setSrc(insertMovBefore(it, 1, G4_Operand::GetNonVectorImmType(type), bb), 1);
                // And we swap addr expr and the new variable
                // add(8) A0(0, 0)<1> : uw TV0(0, 0)<8; 8, 1> : uw &V36 + 0 {Align1, Q1}
                // The final code sequence is
                // mov(8) r13.0<1>:uw 0xeca86420 : uv{ Align1 } // #26:$9:%79
                // add(8) a0.0<1> : uw r13.0<8; 8, 1> : uw 0x60 : uw{ Align1, Q1 }
                inst->setSrc(inst->getSrc(1), 0);
                inst->setSrc(src0, 1);
                inst->swapDefUse();
            }
            else
            {
                // fall-back: load src0 into a temp of an appropriate scalar type
                G4_Type newSrcType = inst->needsDWType() ? (IS_UNSIGNED_INT(src0->getType()) ? Type_UD : Type_D) :
                    G4_Operand::GetNonVectorImmType(src0->getType());
                inst->setSrc(insertMovBefore(it, 0, newSrcType, bb), 0);
            }
        }
    }

    // re-read: the swaps/moves above may have changed the sources
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    // check for non-mad 3src inst

    if (inst->opcode() == G4_madw)
    {
        // src0 can not be immediate.
        if (src0 && src0->isImm())
        {
            // swap src0 and src1 if src0 is immediate but src1 is not immediate
            if (src1 && !src1->isImm())
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
                src0 = inst->getSrc(0);
                src1 = inst->getSrc(1);
            }
            else
            {
                inst->setSrc(insertMovBefore(it, 0, IS_UNSIGNED_INT(src0->getType()) ? Type_UD : Type_D, bb), 0);
                src0 = inst->getSrc(0);
            }
        }

        // fix immediate type of G4_madw as it can only support D/UD types
        if (src1 && src1->isImm())
        {
            uint32_t immVal = (uint32_t)src1->asImm()->getImm();
            inst->setSrc(builder.createImm(immVal, IS_SIGNED_INT(src1->getType()) ? Type_D : Type_UD), 1);
            src1 = inst->getSrc(1);
        }

        if (src2 && src2->isImm())
        {
            uint32_t immVal = (uint32_t)src2->asImm()->getImm();
            inst->setSrc(builder.createImm(immVal, IS_SIGNED_INT(src2->getType()) ? Type_D : Type_UD), 2);
            src2 = inst->getSrc(2);
        }
    }

    // madw can have src1 as immediate
    if (inst->getNumSrc() == 3 && src1->isImm() && inst->opcode() != G4_madw)
    {
        inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
    }

    // Architecture registers may not appear as src1.
    auto isARF = [](G4_Operand* opnd) { return opnd->isAreg() || opnd->isFlag(); };
    if (src1 != nullptr && isARF(src1) && !src1->isNullReg())
    {
        /* See if we can swap the src1 */
        if (INST_COMMUTATIVE(inst->opcode()) && !isARF(src0))
        {
            inst->swapSrc(0, 1);
            inst->swapDefUse();
        }
        else
        {
            /* Otherwise introduce a tmp */
            inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
        }
    }

    src2 = inst->getSrc(2);

    // 3 src instructions except madw can't have any constants
    if (!builder.hasAlign1Ternary() && src2 != nullptr && src2->isImm() && inst->opcode() != G4_madw)
    {
        inst->setSrc(insertMovBefore(it, 2, src2->getType(), bb), 2);
    }
}
1004
fixLine(INST_LIST_ITER it,G4_BB * bb)1005 bool HWConformity::fixLine(INST_LIST_ITER it, G4_BB* bb)
1006 {
1007 G4_INST* inst = *it;
1008
1009 if (inst->opcode() == G4_line)
1010 {
1011 bool badRegion = false;
1012 G4_Operand* src0 = inst->getSrc(0);
1013 // assumption: there are 4 elements in src0
1014 if (src0->isSrcRegRegion())
1015 {
1016 const RegionDesc* rd = src0->asSrcRegRegion()->getRegion();
1017 badRegion = (rd->vertStride != 0 || rd->width != 4 || rd->horzStride != 1);
1018 }
1019 if (!IS_FTYPE(src0->getType()) || src0->isImm() || badRegion ||
1020 !builder.isOpndAligned(src0, numEltPerGRF<Type_UB>() / 2))
1021 {
1022 // insertMovBefore() is not used here
1023 // due to the special region <0;4,1> of src0 of line
1024 G4_Declare* src0_dcl;
1025 G4_DstRegRegion* new_dst_opnd;
1026 G4_SrcRegRegion* new_src0_opnd;
1027 unsigned char mov_size = 4;
1028
1029 src0_dcl = builder.createTempVar(mov_size, Type_F, Eight_Word);
1030 /* Create temporary variable */
1031 // Actully we set region to be <0;4,1> directly here.
1032 const RegionDesc* rd = builder.createRegionDesc(0, 4, 1);
1033 new_src0_opnd = builder.createSrcRegRegion(src0_dcl, rd);
1034 new_dst_opnd = builder.createDstRegRegion(src0_dcl, 1);
1035
1036 G4_INST* newInst = builder.createMov(G4_ExecSize(mov_size), new_dst_opnd, src0, InstOpt_NoOpt, false);
1037 newInst->setNoMask(true);
1038
1039 bb->insertBefore(it, newInst);
1040 inst->setSrc(new_src0_opnd, 0);
1041 return true;
1042 }
1043 }
1044 return false;
1045 }
1046
// Legalize operand types of a non-send instruction.
// Fixes (1) mixed float/int source types, (2) byte-typed src1 on platforms
// that forbid it, and (3) signed D/W operands on bfn (which requires
// unsigned int types). Returns true if any correcting mov was inserted.
bool HWConformity::fixOpndType(INST_LIST_ITER it, G4_BB* bb)
{
    /*
     * Check for instruction that only accept float/int operands, as well as
     * instruction with mixed operand types. Even though vISA itself forbids
     * mixed type instructions, optimizations such as copy propagation
     * may reintroduce them and so we do the checks here
     */
    G4_INST* inst = *it;
    bool changed = false;
    int numSrc = inst->getNumSrc();
    bool has_float = false;
    bool has_int = false;

    if (inst->mayExceedTwoGRF() || inst->opcode() == G4_smov)
    {
        // skip special instructions
        return false;
    }

    // scan the sources to detect a float/int type mix
    for (int i = 0; i < numSrc; i++)
    {
        if (!inst->getSrc(i))
        {
            continue;
        }
        G4_Type ty = inst->getSrc(i)->getType();
        if (IS_TYPE_FLOAT_ALL(ty))
        {
            has_float = true;
        }
        else
        {
            has_int = true;
        }
    }
    if (has_float && has_int)
    {
        // homogenize: convert every non-F/DF source to F via an inserted mov
        for (int i = 0; i < numSrc; i++)
        {
            if (inst->getSrc(i) && !IS_FTYPE(inst->getSrc(i)->getType()) && !IS_DFTYPE(inst->getSrc(i)->getType()))
            {
                // FIXME: we should probably either get rid of this or assert,
                // it's unlikely that blinding casting int to float is the right thing here
                inst->setSrc(insertMovBefore(it, i, Type_F, bb), i);
                changed = true;
            }
        }
    }

    if (builder.noSrc1Byte())
    {
        // platform forbids byte-typed src1: either swap it with src0 or
        // widen it to a word type through a mov
        if (numSrc > 1)
        {
            G4_Operand* src0 = inst->getSrc(0);
            G4_Operand* src1 = inst->getSrc(1);
            if (src0 != nullptr && src1 != nullptr && IS_BTYPE(src1->getType()))
            {
                if (!IS_BTYPE(src0->getType()) && inst->canSwapSource())
                {
                    inst->swapSrc(0, 1);
                }
                else
                {
                    bool hasModMinus = false;
                    if (src1->isSrcRegRegion())
                    {
                        G4_SrcModifier mod = src1->asSrcRegRegion()->getModifier();
                        hasModMinus = (mod == Mod_Minus || mod == Mod_Minus_Abs);
                    }
                    // If minus modifier is present, need signed type.
                    G4_Type Ty = (IS_SIGNED_INT(src1->getType()) || hasModMinus) ? Type_W : Type_UW;
                    inst->setSrc(insertMovBefore(it, 1, Ty, bb), 1);
                    changed = true;
                }
            }
        }
    }
    if (inst->opcode() == G4_bfn)
    {
        // BFN requires its operands to be UD/UW
        // ToDo: anyway to generalize this to all instructions requiring signed/unsigned int type? IGA doesn't seem to have API to query supported types
        auto dst = inst->getDst();
        if (dst->getType() == Type_D || dst->getType() == Type_W)
        {
            // signed -> unsigned of the same width; bit pattern is unchanged
            dst->setType(dst->getType() == Type_D ? Type_UD : Type_UW);
        }
        auto changeSrcToUnsigned = [](G4_Operand* opnd)
        {
            if (opnd->isSrcRegRegion() && (opnd->getType() == Type_D || opnd->getType() == Type_W))
            {
                opnd->asSrcRegRegion()->setType(opnd->getType() == Type_D ? Type_UD : Type_UW);
            }
        };
        changeSrcToUnsigned(inst->getSrc(0));
        changeSrcToUnsigned(inst->getSrc(1));
        changeSrcToUnsigned(inst->getSrc(2));
    }
    return changed;
}
1147
1148 /*
1149 * fixOpnds() looks for operands conformity:
1150 * 1. checks can operand be a constant.
1151 * 2. checks if operand's type is conformant to operation.
1152 * 3. check if only src0 uses VxH
1153 * 4. check if indirect scalar is used in compressed inst
1154 * It tries to fix these cases by changing operands order if possible
1155 * or by insertion if temporary location with appropriate conversion.
1156 */
void HWConformity::fixOpnds(INST_LIST_ITER it, G4_BB* bb, G4_Type& exType)
{
    G4_INST* inst = *it;
    if (inst->isSend())
    {
        return;
    }

    G4_Operand* src0, * src1, * src2;

    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    if (inst->opcode() == G4_mul)
    {
        // mul with a D-typed src1 and a narrower src0: move the D operand
        // to the src0 slot (swapping the operands) so the operand-width
        // rules for mul are met
        if (IS_DTYPE(src1->getType()) &&
            !(IS_DTYPE(src0->getType()) || IS_FTYPE(src0->getType())))
        {
            // check if src0 uses VxH
            bool src0_use_VxH = false;

            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() != Direct &&
                src0->asSrcRegRegion()->getRegion()->isRegionWH()) // is this safe?
            {
                src0_use_VxH = true;
            }
            if (src0_use_VxH)
            {
                // VxH regions are only legal on src0; copy to a temp before
                // the swap moves this operand into the src1 slot
                src0 = insertMovBefore(it, 0, src0->getType(), bb);
            }
            inst->setSrc(src0, 1);
            inst->setSrc(src1, 0);
            inst->swapDefUse();
            src0 = inst->getSrc(0);
            src1 = inst->getSrc(1);
        }

        // src1 with a VxH (indirect Vx1/VxH) region must be fixed: either
        // replace it with a temp, or swap it into the src0 slot
        if (src1->isSrcRegRegion() && src1->asSrcRegRegion()->getRegAccess() != Direct &&
            src1->asSrcRegRegion()->getRegion()->isRegionWH())
        {
            if (IS_DTYPE(src0->getType()) &&
                !(IS_DTYPE(src1->getType()) || IS_FTYPE(src1->getType())))
            {
                // cannot swap here (would re-create the width problem above)
                inst->setSrc(insertMovBefore(it, 1, src1->getType(), bb), 1);
            }
            else
            {
                inst->swapSrc(0, 1);
                inst->swapDefUse();
            }
            src0 = inst->getSrc(0);
            src1 = inst->getSrc(1);
        }
    }

    fixImmAndARFSrc(it, bb);

    // re-read sources: fixImmAndARFSrc may have replaced/swapped them
    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    src2 = inst->getSrc(2);

    // Vx1 and VxH can only be used for src0
    bool src0_use_VxH = false, src1_use_VxH = false;

    if (src2 &&
        src2->isSrcRegRegion() &&
        src2->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        inst->setSrc(insertMovBefore(it, 2, exType, bb), 2);
    }

    if (src0 != NULL &&
        src0->isSrcRegRegion() &&
        src0->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        src0_use_VxH = true;
    }

    if (src1 != NULL &&
        src1->isSrcRegRegion() &&
        src1->asSrcRegRegion()->getRegion()->isRegionWH())
    {
        src1_use_VxH = true;
    }

    if (src1_use_VxH)
    {
        // prefer swapping the VxH operand into src0 when the opcode allows
        // it; otherwise fall back to copying src1 into a temp
        if ((INST_COMMUTATIVE(inst->opcode()) || inst->opcode() == G4_cmp)
            && !src0_use_VxH &&
            !(inst->opcode() == G4_mul && IS_DTYPE(src0->getType())))
        {
            inst->swapSrc(0, 1);
            if (inst->opcode() == G4_cmp)
            {
                // change condMod
                // cmp is not commutative: swapping its sources requires
                // mirroring the conditional modifier (g <-> l, ge <-> le)
                G4_CondMod* condMod = inst->getCondMod();
                if (condMod)
                {
                    G4_CondMod* newCondModOpnd = builder.createCondMod(
                        getReverseCondMod(condMod->getMod()), condMod->getBase(), condMod->getSubRegOff());
                    inst->setCondMod(newCondModOpnd);
                }
            }
        }
        else
        {
            inst->setSrc(insertMovBefore(it, 1, exType, bb), 1);
        }
    }

    // at this point only src0 may be VxH
    // VxH regioning and conditional modifiers may not co-exist
    if (builder.getPlatform() >= GENX_ICLLP)
    {
        src0 = inst->getSrc(0);
        if (src0 && src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegion()->isRegionWH())
        {
            if (inst->getCondMod())
            {
                inst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
            }
        }
    }
}
1282
fixAlign13SrcInst(INST_LIST_ITER iter,G4_BB * bb)1283 void HWConformity::fixAlign13SrcInst(INST_LIST_ITER iter, G4_BB* bb)
1284 {
1285 // again mad should already conform by construction
1286 G4_INST* inst = *iter;
1287 MUST_BE_TRUE(inst->getNumSrc() == 3 && !inst->isSend(), "expect 3src inst");
1288
1289 if (inst->opcode() != G4_mad && inst->opcode() != G4_madw)
1290 {
1291 G4_DstRegRegion* dst = inst->getDst();
1292 if (!isGoodAlign1TernaryDst(inst))
1293 {
1294 auto alignment = builder.noSrc2Regioning() ? GRFALIGN : Four_Word;
1295 replaceDst(iter, dst->getType(), alignment);
1296 }
1297
1298 bool canBeImm = true;
1299 for (int i = 0; i < inst->getNumSrc(); ++i)
1300 {
1301 if (!isGoodAlign1TernarySrc(inst, i, canBeImm))
1302 {
1303 if (i == 2 && builder.noSrc2Regioning())
1304 {
1305 // some additional handling for src2 when src2 regioning is not available
1306 fixSrc2(iter, bb, false);
1307 }
1308 else
1309 {
1310 G4_SubReg_Align subalign = (i == 2) ? Four_Word : Any;
1311 inst->setSrc(insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb, subalign), i);
1312 }
1313 }
1314 else
1315 {
1316 if (inst->getSrc(i)->isImm())
1317 {
1318 canBeImm = false;
1319 }
1320 }
1321 }
1322 }
1323 }
1324
// Legalize a 3-source (ternary) instruction: dispatch to the align1 path
// when supported, otherwise enforce align16 dst/src rules, and apply the
// SIMD16-on-3src workaround where required.
void HWConformity::fix3SrcInst(INST_LIST_ITER iter, G4_BB* bb)
{
    G4_INST* inst = *iter;
    if (inst->getNumSrc() != 3 || inst->mayExceedTwoGRF() || inst->opcode() == G4_madm)
    {
        return;
    }

    if (builder.hasAlign1Ternary())
    {
        // platform supports align1 ternary encoding; it has its own rules
        fixAlign13SrcInst(iter, bb);
        return;
    }

    if (inst->opcode() != G4_mad && inst->opcode() != G4_madw)
    {
        // check that dst and srcs are legal for 3src. We do not check
        // mad and madw since they should already conform by construction
        uint8_t execSize = inst->getExecSize();
        G4_DstRegRegion* dst = inst->getDst();
        // dst must be direct, packed (hstride 1), and aligned to
        // min(32, execSize * 4) bytes
        if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
            !builder.isOpndAligned(dst, (execSize >= 8) ? 32 : execSize * 4))
        {
            replaceDst(iter, dst->getType());
        }
        for (int i = 0; i < 3; i++)
        {
            if (!isGoodAlign16Src(inst, i))
            {
                inst->setSrc(
                    insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb),
                    i);
            }
        }
    }

    //When it is set (Align16), the instruction uses 16-byte-aligned addressing for source and destination operands.
    if ((inst->getExecSize() == g4::SIMD1))
    {
        if (inst->getDst() &&
            inst->getDst()->getBase()->isRegVar())
        {
            if (!builder.isOpndAligned(inst->getDst(), 16))
            {
                replaceDst(iter, inst->getDst()->getType(), Eight_Word);
            }
        }
    }

    if (inst->getExecSize() == g4::SIMD16)
    {
        // WaDisableSIMD16On3SrcInstr: split SIMD16 3src into two SIMD8
        // halves, except for the HF exec-type case where src1 is an HF
        // region that does not cross a GRF boundary
        bool wa3rc = (VISA_WA_CHECK(builder.getPWaTable(), WaDisableSIMD16On3SrcInstr) &&
            !(inst->getExecType() == Type_HF &&
                inst->getOperand(Opnd_src1)->isSrcRegRegion() &&
                inst->getOperand(Opnd_src1)->getType() == Type_HF &&
                !inst->getOperand(Opnd_src1)->asSrcRegRegion()->crossGRF()));

        if (wa3rc)
        {
            evenlySplitInst(iter, bb);
        }
    }
}
1388
fixCompareInst(INST_LIST_ITER i,G4_BB * bb,G4_Type exType,int dst_elsize)1389 void HWConformity::fixCompareInst(
1390 INST_LIST_ITER i,
1391 G4_BB* bb,
1392 G4_Type exType,
1393 int dst_elsize)
1394 {
1395 G4_INST* inst = *i;
1396 G4_Operand* dst = inst->getDst();
1397
1398 if (dst && dst->isNullReg())
1399 {
1400 // change dst hstride if necessary
1401 if (TypeSize(exType) != dst->getTypeSize())
1402 {
1403 // create a new dst with new stride
1404 G4_DstRegRegion* new_null = builder.createNullDst(exType);
1405 inst->setDest(new_null);
1406 }
1407 }
1408 }
1409
1410 // For integer packing moves, we can replace the src type with the dst type instead of inserting
1411 // a new move to satisfy dst alignment, since integer down conversion is based on truncation
1412 // an inst has to satisfy the following properties:
1413 // -- is a move (duh) and does not have conditional modifiers or saturation
1414 // -- dst must be a direct DstRegRegion that is GRF-aligned
1415 // -- src must be a direct SrcRegRegion with GRF base, no modifiers, and packed/scalar region
1416 // -- both dst and src have integer type, with source stride > dst stride
1417 // returns true if we have successfully down cast the src type
static bool canReplaceMovSrcType(IR_Builder& builder, G4_INST* inst, uint32_t extypesize)
{

    // must be a plain raw mov: no conditional modifier, no saturation
    if (inst->opcode() != G4_mov || inst->getCondMod() != NULL || inst->getSaturate())
    {
        return false;
    }
    if (!inst->getSrc(0)->isSrcRegRegion())
    {
        return false;
    }

    G4_DstRegRegion* dst = inst->getDst();
    G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
    int dstByteOffset = dst->getByteOffset();
    if (dstByteOffset % extypesize != 0 ||
        dst->getRegAccess() != Direct)
    {
        // don't do this if dst is not GRF aligned, since we have to fix it later anyway
        return false;
    }

    // src must be a direct, unmodified GRF region
    if (src0->getRegAccess() != Direct || src0->getModifier() != Mod_src_undef ||
        (src0->getTopDcl() == NULL || src0->getTopDcl()->getRegFile() != G4_GRF))
    {
        return false;
    }

    // int->int move where the dst stride in bytes is narrower than the
    // source element: a packing (down-converting) move
    bool isIntPackingMove = false;
    if (IS_TYPE_INT(dst->getType()) && IS_TYPE_INT(src0->getType()))
    {
        uint32_t dstAlign = dst->getTypeSize() * dst->getHorzStride();
        if (dstAlign < src0->getTypeSize())
        {
            isIntPackingMove = true;
        }
    }

    if (!isIntPackingMove)
    {
        return false;
    }

    // we only handle direct contiguous and scalar source region for now,
    // as VxH and strided regions are a bit harder to update
    if (src0->getRegion()->isContiguous(inst->getExecSize()))
    {
        uint16_t newHS = extypesize / dst->getTypeSize();
        if (newHS > 4)
        {
            // rule out Q -> B moves if Q is not scalar
            return false;
        }
    }
    else if (!src0->isScalar())
    {
        // only handle scalar and contiguous regions for now
        return false;
    }

    // instead of inserting a move, we change src's type to be same as dst type
    // e.g.,
    // mov (8) r1.0<1>:b r2.4<8;8,1>:d
    // becomes
    // mov (8) r1.0<1>:b r2.16<32;8,4>:b
    // This is safe since integer down conversion is based on truncation
    uint32_t typeSizeRatio = extypesize / dst->getTypeSize();
    uint32_t numElt = src0->isScalar() ? 1 : inst->getExecSize() * typeSizeRatio;
    // alias a dcl of dst's type over src0's root declare so the
    // reinterpreted region still refers to the same storage
    G4_Declare* newDcl = builder.createTempVar(numElt, dst->getType(), Any);
    newDcl->setAliasDeclare(src0->getBase()->asRegVar()->getDeclare(), 0);
    const RegionDesc* region = src0->isScalar() ? builder.getRegionScalar() :
        builder.createRegionDesc((uint16_t)inst->getExecSize(), (uint16_t)inst->getExecSize() * typeSizeRatio,
            inst->getExecSize(),
            (uint16_t)typeSizeRatio);
    G4_SrcRegRegion* newSrc = builder.createSrc(
        newDcl->getRegVar(),
        src0->getRegOff(),
        src0->getSubRegOff() * typeSizeRatio,
        region,
        dst->getType());
    inst->setSrc(newSrc, 0);
    return true;
}
1501
1502 // implement HW restrictions on mov
1503 // -- There is no direct conversion from B/UB to DF or DF to B/UB.
1504 // Use two instructions and a word or DWord intermediate type.
1505 // -- There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1506 // Use two instructions and a word or DWord intermediate integer type.
1507 // -- There is no direct conversion from HF to DF or DF to HF.
1508 // Use two instructions and F (Float) as an intermediate type.
1509 // -- There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
1510 // Use two instructions and F (Float) or a word integer type or a DWord integer type as an intermediate type.
1511 // -- There is no direct scalar conversion from B/UB to HF or F.
1512 // Use two instructions and a WORD or DWORD intermediate type respectively.
1513 // -- There is no direct conversion from HF to Integer (DWORD or WORD).
1514 // Use two instructions and F (Float) as an intermediate type.
1515 // returns true if a move is inserted
fixMov(INST_LIST_ITER i,G4_BB * bb)1516 bool HWConformity::fixMov(INST_LIST_ITER i, G4_BB* bb)
1517 {
1518 G4_INST* inst = *i;
1519
1520 if (inst->opcode() != G4_mov)
1521 {
1522 return false;
1523 }
1524
1525 G4_Type dstType = inst->getDst()->getType();
1526 G4_Type srcType = inst->getSrc(0)->getType();
1527 auto src = inst->getSrc(0);
1528
1529 bool scalarByteToFloat = builder.noScalarByteToFloat() &&
1530 IS_BTYPE(srcType) &&
1531 (IS_FTYPE(dstType) || IS_HFTYPE(dstType)) &&
1532 (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar());
1533 bool dstByteSrc64b = IS_BTYPE(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType));
1534
1535 if (scalarByteToFloat || dstByteSrc64b)
1536 {
1537 replaceDst(i, Type_W);
1538 return true;
1539 }
1540 if (IS_BTYPE(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
1541 {
1542 // mov Q/DF B
1543 replaceDst(i, Type_W);
1544 return true;
1545 }
1546 if (isLowPrecisionFloatTy(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType)))
1547 {
1548 // mov HF Q/DF
1549 replaceDst(i, Type_F);
1550 return true;
1551 }
1552 if (isLowPrecisionFloatTy(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
1553 {
1554 // mov Q/DF HF
1555 replaceDst(i, Type_F);
1556 return true;
1557 }
1558 const bool noHFToInteger = builder.noHFToInteger() &&
1559 IS_HFTYPE(srcType) &&
1560 (dstType == Type_D || dstType == Type_W);
1561 if (noHFToInteger)
1562 {
1563 // mov W/DW HF
1564 replaceDst(i, Type_F);
1565 return true;
1566 }
1567 return false;
1568 }
1569
fixRotate(INST_LIST_ITER i,G4_BB * bb)1570 bool HWConformity::fixRotate(INST_LIST_ITER i, G4_BB* bb)
1571 {
1572
1573 // rotate requires src0 and dst to have the same datatype precision
1574 // It also does not support *B/*Q types, but that should be enforced at the vISA level
1575 // returns true if new instruction is inserted
1576 bool changed = false;
1577 G4_INST* inst = *i;
1578 if (inst->opcode() != G4_rol && inst->opcode() != G4_ror)
1579 {
1580 return false;
1581 }
1582 G4_DstRegRegion* dst = inst->getDst();
1583 G4_SrcRegRegion* src = inst->getSrc(0)->asSrcRegRegion();
1584
1585 MUST_BE_TRUE(IS_WTYPE(dst->getType()) || IS_DTYPE(dst->getType()) || IS_QTYPE(dst->getType()), "dst type must be *W or *D or *Q");
1586 MUST_BE_TRUE(IS_WTYPE(src->getType()) || IS_DTYPE(src->getType()) || IS_QTYPE(src->getType()), "src type must be *W or *D or *Q");
1587
1588 if (dst->getTypeSize() != src->getTypeSize())
1589 {
1590 // keep exec type same and change dst to be same type as src
1591 replaceDst(i, src->getType());
1592 dst = inst->getDst();
1593 changed = true;
1594 }
1595
1596 if (dst->getType() == Type_W)
1597 {
1598 dst->setType(Type_UW);
1599 }
1600 else if (dst->getType() == Type_D)
1601 {
1602 dst->setType(Type_UD);
1603 }
1604 else if (builder.getPlatform() >= GENX_PVC && dst->getType() == Type_Q)
1605 {
1606 dst->setType(Type_UQ);
1607 }
1608
1609 if (src->getType() == Type_W)
1610 {
1611 src->setType(Type_UW);
1612 }
1613 else if (src->getType() == Type_D)
1614 {
1615 src->setType(Type_UD);
1616 }
1617 else if (builder.getPlatform() >= GENX_PVC && src->getType() == Type_Q)
1618 {
1619 src->setType(Type_UQ);
1620 }
1621 return changed;
1622 }
1623
// Enforce dst alignment/stride restrictions for the instruction at `i`:
// dst.HorzStride * sizeof(dst.Type) must cover the execution type size
// (`extype`, size `extypesize` bytes; dst element size is `dst_elsize`).
// Returns true if a correcting mov was inserted or the inst was split.
bool HWConformity::fixDstAlignment(INST_LIST_ITER i, G4_BB* bb, G4_Type extype, unsigned int dst_elsize)
{
    G4_INST* inst = *i;
    bool insertMOV = false;

    unsigned char exec_size = inst->getExecSize();
    G4_DstRegRegion* dst = inst->getDst();
    G4_Operand* src0 = inst->getSrc(0);
    unsigned h_stride = dst->getHorzStride();
    unsigned int extypesize = TypeSize(extype);

    if (hasDedicateAlignRegionConformity(i))
    {
        // this opcode has its own dedicated region-conformity handling
        return insertMOV;
    }

    if (inst->hasNULLDst())
    {
        // null dst: no alignment to fix, but the stride must still cover the
        // execution type; widen the stride (or retype) as needed
        if (dst_elsize * h_stride < extypesize)
        {
            uint16_t newHStride = extypesize / dst_elsize;
            if (newHStride == 8)
            {
                MUST_BE_TRUE(dst_elsize == 1, "expect B/UB dst");
                if (inst->opcode() == G4_mov && exec_size == 1 &&
                    src0->isSrcRegRegion() && !src0->asSrcRegRegion()->hasModifier())
                {
                    // Just set src to be the same type as dst
                    src0->asSrcRegRegion()->setType(dst->getType());
                }
                else
                {
                    // stride 8 is not encodable; go through a W-typed dst
                    replaceDst(i, Type_W);
                    return true;
                }
            }
            else
            {
                MUST_BE_TRUE(newHStride <= 4, "horizontal stride must be <=4");
                dst->setHorzStride(newHStride);
            }
        }

        return insertMOV;
    }

    // optimize initialization instructions
    if (inst->opcode() == G4_mov && src0->isImm() &&
        (bb->isAllLaneActive() || inst->isWriteEnableInst()) &&
        !inst->getPredicate() &&
        dst->getRegAccess() == Direct &&
        dst->getHorzStride() == 1 &&
        inst->getSaturate() == false &&
        IS_BTYPE(dst->getType()) &&
        !IS_TYPE_F32_F64(src0->getType()) &&
        builder.isOpndAligned(dst, src0->getTypeSize()))
    {
        // inst is a mov with packed byte dst and int imm source
        // replicate the byte immediate into a UW/UD immediate and shrink the
        // exec size accordingly, avoiding the byte-dst restriction entirely
        int64_t value = src0->asImm()->getInt();
        uint64_t new_value = (value & 0xFF) | (value << 0x8);
        int scale = 2;

        if (IS_DTYPE(src0->getType()))
        {
            scale = 4;
            new_value = (new_value & 0xFFFF) | (new_value << 0x10);
        }

        if (exec_size >= scale)
        {
            G4_Type new_type = (scale == 2) ? Type_UW : Type_UD;
            auto newDst = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() / scale, 1, new_type, dst->getAccRegSel());
            inst->setDest(newDst);
            inst->setSrc(builder.createImm(new_value, new_type), 0);
            inst->setExecSize(G4_ExecSize(exec_size / scale));
            return insertMOV;
        }
    }

    bool byteDst = IS_BTYPE(dst->getType());

    // Byte can not be used as dstination of INT*INT
    if ((byteDst && inst->opcode() == G4_mul &&
        IS_TYPE_INT(inst->getSrc(0)->getType()) && IS_TYPE_INT(inst->getSrc(1)->getType())))
    {
        // change dst type to W
        replaceDst(i, Type_W);
        return true;
    }

    if (byteDst && extypesize == 8)
    {
        // Gen doesn't support hstride 8, so we add a W move here
        replaceDst(i, Type_W);
        return true;
    }

    if (builder.hasBFMixMode() && extype == Type_F && inst->getDst()->getType() == Type_BF && !inst->isDpas())
    {
        // For now, BF mixed mode should not need this check.
        // If visa may allow any region as input under bf mixed mode, we need to change this.
        return false;
    }

    bool dstHFMixModeInst = inst->getDst()->getType() == builder.getMixModeType() && extype == Type_F;
    // rule 10.1.2: for exec size > 1 the dst stride (in bytes) must cover the
    // execution type size, unless HF mixed mode relaxes it
    bool dstNotAlignedToExecType = exec_size > 1 && (dst_elsize * h_stride) < extypesize &&
        !(builder.hasMixMode() && dstHFMixModeInst);
    unsigned short dst_byte_offset;
    builder.isOpndAligned(dst, dst_byte_offset, extypesize);
    if (!((dst_byte_offset % extypesize == 0) ||
        (byteDst &&
        (dst_byte_offset % extypesize == 1))
        ) ||
        /*
         * Dynamic offset can be odd for serialized instructions
         * or when horizontal offset is dynamic.
         * Probably we need the same for any dst with dynamic offsets.
         */
        (dst_elsize < extypesize &&
            dst->getRegAccess() != Direct &&
            !(byteDst && extypesize == 2 && exec_size == 1)
            ) ||
        dstNotAlignedToExecType)
    {
        /*
         * 10.3
         * For byte dst type:
         * 1. no 1 horstride
         * 2. no odd start subreg
         * There is only one excpetion - raw mov op
         * Raw means src operand has no attribute.
         *
         * Note: Actually all these cases are now controlled
         * by extypesize value.
         */

        if (inst->isRawMov() &&
            (dst_byte_offset % extypesize == 0 ||
            (byteDst && dst_byte_offset % extypesize == 1)))
        {
            // raw mov is exempt when the subreg offset is acceptable
            return insertMOV;
        }

        if (canReplaceMovSrcType(builder, inst, extypesize))
        {
            // fixed by retyping the mov's src instead of inserting a mov
            return false;
        }

        if (inst->opcode() == G4_mov)
        {
            bool intHFConversion = false;
            G4_Operand* src0 = inst->getSrc(0);
            // int <-> HF conversions are NOT exempt from the fix below
            if (isLowPrecisionFloatTy(dst->getType()) && IS_TYPE_INT(src0->getType()))
            {
                intHFConversion = true;
            }
            else if (isLowPrecisionFloatTy(src0->getType()) && IS_TYPE_INT(dst->getType()))
            {
                intHFConversion = true;
            }
            // F to packed HF operations are handled specially later
            bool FtoHFMov = dst->getType() == Type_HF && src0->getType() == Type_F;
            if (builder.getPlatform() >= GENX_CHV && !intHFConversion &&
                (inst->isMixedMode() || (builder.hasFtoPackedHFMove() && FtoHFMov && inst->getExecSize() >= builder.getNativeExecSize())))
            {
                return insertMOV;
            }
        }

        if (splitInstListForByteDst(i, bb, (uint16_t)extypesize))
        {
            // the inst was split into byte-dst-legal pieces
            return true;
        }

        // fall back: write to an aligned temp and mov it to the real dst
        inst->setDest(insertMovAfter(i, dst, dst->getType(), bb));
        insertMOV = true;
    }

    return insertMOV;
}
1804
fixPredicateIndirectInst(INST_LIST_ITER it,G4_BB * bb)1805 void HWConformity::fixPredicateIndirectInst(INST_LIST_ITER it, G4_BB* bb)
1806 {
1807 G4_INST* inst = (*it);
1808 if (inst->getPredicate() &&
1809 inst->getDst() &&
1810 !inst->getDst()->isNullReg() &&
1811 inst->getDst()->getRegAccess() == Direct)
1812 {
1813 bool hasIndirectSource = false;
1814 for (int i = 0; i < inst->getNumSrc(); i++)
1815 {
1816 G4_Operand* opnd = inst->getSrc(i);
1817
1818 if (opnd && opnd->isSrcRegRegion() &&
1819 opnd->asSrcRegRegion()->getRegAccess() == IndirGRF)
1820 {
1821 if (inst->opcode() == G4_sel)
1822 {
1823 replaceSrc(it, i, opnd->getType(), bb);
1824 }
1825 else
1826 {
1827 hasIndirectSource = true;
1828 break;
1829 }
1830 }
1831 }
1832
1833 if (hasIndirectSource)
1834 {
1835 replaceDst(it, inst->getDst()->getType());
1836 }
1837 }
1838 }
1839
1840 /*
1841 * This function checks to see if the instruction's indirect operands
1842 * potentially require totally more than 8 distinct addr reg sub-registers, and
1843 * then determines which of the operands to spill into temporary GRFs so
1844 * as to limit total number of distinct sub-registers used by the instruction
1845 * to 8. This is a requirement imposed by the CM register allocator.
1846 */
1847
fixIndirectOpnd(INST_LIST_ITER i,G4_BB * bb)1848 bool HWConformity::fixIndirectOpnd(INST_LIST_ITER i, G4_BB* bb)
1849 {
1850 G4_INST* inst = *i;
1851
1852 G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);
1853 G4_DstRegRegion* dst = inst->getDst();
1854 bool null_dst = (!dst || inst->hasNULLDst());
1855
1856 bool null_src0 = !src0;
1857 bool null_src1 = !src1 || (inst->isMath() && src1->isNullReg());
1858
1859 const int addr_reg_max_count = 16;
1860 const int addr_reg_size = TypeSize(Type_UW);
1861 int src_uniq_count = 0;
1862 int src1_count = 0;
1863 int src0_count = 0;
1864 int dst_uniq_count = 0;
1865 int dst_count = 0;
1866 bool nospill_src1 = false;
1867 bool nospill_src0 = false;
1868 bool nospill_dst = false;
1869 bool spill_src1 = false;
1870 bool spill_src0 = false;
1871 bool spill_dst = false;
1872 G4_Declare* addr_dcl0 = NULL, * addr_dcl1 = NULL, * addr_dcl2 = NULL;
1873 if (!null_src0 && src0->isSrcRegRegion() &&
1874 src0->getRegAccess() != Direct && src0->asSrcRegRegion()->getBase()->isRegVar()) {
1875 addr_dcl0 = src0->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1876 // is the following precise?
1877 src0_count = addr_dcl0->getTotalElems();
1878 MUST_BE_TRUE(src0_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1879 src_uniq_count += src0_count;
1880 }
1881
1882 if (!null_src1 && src1->isSrcRegRegion() &&
1883 src1->getRegAccess() != Direct && src1->asSrcRegRegion()->getBase()->isRegVar()) {
1884 addr_dcl1 = src1->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1885 src1_count = addr_dcl1->getTotalElems();
1886 MUST_BE_TRUE(src1_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1887 if (addr_dcl1 != addr_dcl0) {
1888 // should we use top level dcl here?
1889 src_uniq_count += src1_count;
1890 }
1891 else {
1892 nospill_src1 = true;
1893 nospill_src0 = true;
1894 }
1895 }
1896
1897 if (!null_dst &&
1898 dst->getRegAccess() != Direct && dst->getBase()->isRegVar())
1899 {
1900 addr_dcl2 = dst->getBase()->asRegVar()->getDeclare()->getRootDeclare();
1901 dst_count = addr_dcl2->getTotalElems();
1902 MUST_BE_TRUE(dst_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand.");
1903 if (addr_dcl2 != addr_dcl0 && addr_dcl2 != addr_dcl1) {
1904 dst_uniq_count += dst_count;
1905 }
1906 else if (addr_dcl2 != addr_dcl0) {
1907 nospill_dst = true;
1908 nospill_src0 = true;
1909 }
1910 else {
1911 nospill_dst = true;
1912 nospill_src1 = true;
1913 }
1914 }
1915
1916 if (src_uniq_count > addr_reg_max_count) {
1917 if (src0_count > src1_count || nospill_src1) {
1918 MUST_BE_TRUE(nospill_src0 == false, "Address of source0 should be spilled.");
1919 spill_src0 = true;
1920 src_uniq_count -= src0_count;
1921 }
1922 else {
1923 MUST_BE_TRUE(nospill_src1 == false, "Address of source1 should be spilled.");
1924 spill_src1 = true;
1925 src_uniq_count -= src1_count;
1926 }
1927 }
1928
1929 if (src_uniq_count + dst_uniq_count > addr_reg_max_count) {
1930 MUST_BE_TRUE(nospill_dst == false, "Address of dst should be spilled.");
1931
1932 if (nospill_src1 && nospill_src0) {
1933 spill_dst = true;
1934 dst_uniq_count = 0;
1935 }
1936 else if (dst_uniq_count > src0_count&& dst_uniq_count > src1_count) {
1937 spill_dst = true;
1938 dst_uniq_count = 0;
1939 }
1940 else if (spill_src0) {
1941 spill_src1 = true;
1942 src_uniq_count -= src1_count;
1943 }
1944 else if (spill_src1) {
1945 spill_src0 = true;
1946 src_uniq_count -= src0_count;
1947 }
1948 else if (src0_count > src1_count) {
1949 spill_src0 = true;
1950 src_uniq_count -= src0_count;
1951 }
1952 else {
1953 spill_src1 = true;
1954 src_uniq_count -= src1_count;
1955 }
1956 }
1957
1958 MUST_BE_TRUE(src_uniq_count + dst_uniq_count <= addr_reg_max_count,
1959 "Remianed number of address registers should be no more than 8 after spill.");
1960
1961 // Is this only for iselect?
1962 // What if a scalar with indirect addressing is used?
1963 if (spill_src0) {
1964 G4_Operand* new_src0 = insertMovBefore(i, 0, src0->getType(), bb);
1965 inst->setSrc(new_src0, 0);
1966 }
1967
1968 if (spill_src1 && src1) {
1969 G4_Operand* new_src1 = insertMovBefore(i, 1, src1->getType(), bb);
1970 inst->setSrc(new_src1, 1);
1971 }
1972
1973 if (spill_dst && dst)
1974 {
1975 G4_DstRegRegion* new_dst = insertMovAfter(i, dst, dst->getType(), bb);
1976 inst->setDest(new_dst);
1977 if (dst != new_dst &&
1978 (IS_FTYPE(dst->getType()) || IS_DFTYPE(dst->getType())))
1979 {
1980 inst->setSaturate(g4::NOSAT);
1981 }
1982 }
1983 return spill_dst;
1984 }
1985
1986 // If an accumulator is a source operand, its register region must match that of the
1987 // destination register (which means GRF-aligned since we always GRF-align Acc)
1988 // also check for restrictions on explicit acc dst
fixAcc(INST_LIST_ITER iter,G4_BB * bb)1989 bool HWConformity::fixAcc(INST_LIST_ITER iter, G4_BB* bb)
1990 {
1991 G4_INST* inst = *iter;
1992
1993 bool changed = false;
1994 auto dst = inst->getDst();
1995 if ((dst && dst->isAccReg()) || inst->opcode() == G4_mach)
1996 {
1997 if (!builder.accDstforIndirectSrc())
1998 {
1999 if (inst->getSrc(0)->isSrcRegRegion() && inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == IndirGRF)
2000 {
2001 inst->setSrc(insertMovBefore(iter, 0, inst->getSrc(0)->getType(), bb), 0);
2002 changed = true;
2003 }
2004 }
2005 }
2006
2007 // implicit acc src/dst get its offset from dst
2008 bool useAcc = inst->hasImplicitAccSrc() || inst->hasImplicitAccDst();
2009 if (!useAcc)
2010 {
2011 for (int i = 0; i < inst->getNumSrc(); ++i)
2012 {
2013 G4_Operand* src = inst->getSrc(i);
2014 if (src && src->isAccReg())
2015 {
2016 useAcc = true;
2017 break;
2018 }
2019 }
2020 }
2021
2022 if (useAcc &&
2023 dst &&
2024 dst->getBase() &&
2025 dst->getBase()->isRegVar())
2026 {
2027 if (!builder.isOpndAligned(dst, numEltPerGRF<Type_UB>()))
2028 {
2029 inst->setDest(insertMovAfter(iter, dst, dst->getType(), bb, GRFALIGN));
2030 changed = true;
2031 }
2032 }
2033
2034 return changed;
2035 }
2036
2037 /*
2038 * When operation execution size is 1, destination horizontal stride is set
2039 * according to rule 10.2:
2040 *
2041 * 10.1.2. If ExecSize is greater than 1, dst.HorzStride*sizeof(dst.Type) must
2042 * be equal to or greater than the size of the execution data type.
2043 * 10.2. If ExecSize is 1, dst.HorzStride must not be 0. Note that this is
2044 * relaxed from rule 10.1.2. Also note that this rule for destination
2045 * horizontal stride is different from that for source as stated
2046 * in rule #7.
2047 *
2048 * There are some instructions which work unpredictably if both ExecSize
2049 * and dst.HorzStride are 1. But they work fine if dst.HorzStride is set
2050 * according to rule 10.1.2. So we have to correct all such cases.
2051 *
2052 * This supposed to be the last operation before emitting final assembly code.
2053 */
fixDstHstride(INST_LIST_ITER i,int extypesize)2054 void HWConformity::fixDstHstride(INST_LIST_ITER i, int extypesize)
2055 {
2056 G4_INST* inst = *i;
2057 G4_DstRegRegion* dst = inst->getDst();
2058 int dst_elsize = dst->getTypeSize();
2059
2060 if (dst)
2061 {
2062 unsigned short hs = dst->getHorzStride();
2063 if (hs * dst_elsize < extypesize)
2064 {
2065 dst->setHorzStride((unsigned short)(extypesize / dst_elsize));
2066 }
2067 }
2068 }
2069
template<class T>
bool isPreAssignedRegOffsetNonZero(T* region)
{
    // T is non-NULL and either G4_SrcRegRegion or G4_DstRegRegion.
    // Returns true when the operand's base variable already has a physical
    // register assigned at a non-zero sub-register offset.
    if (!region->isSrcRegRegion() && !region->isDstRegRegion())
    {
        return false;
    }

    auto base = region->getBase();
    return base != nullptr &&
        base->isRegVar() &&
        base->asRegVar()->isPhyRegAssigned() &&
        base->asRegVar()->getPhyRegOff() != 0;
}
2088
// Expand a dword mul into the mul+macl sequence (see doGenerateMacl).
// If the mul's execution size exceeds the native SIMD width, it is first
// split in place and each resulting mul is expanded individually.
void HWConformity::generateMacl(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* mulInst = *it;
    MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
    if (mulInst->getExecSize() > builder.getNativeExecSize())
    {
        // Remember the position just before the mul so that, after the
        // split, we can walk the newly created instructions.
        auto startIter = it;
        bool isFirstInst = startIter == bb->begin();
        if (!isFirstInst)
        {
            --startIter;
        }
        evenlySplitInst(it, bb);
        if (!isFirstInst)
        {
            ++startIter;
        }
        // startIter now points to first mul created by split
        auto endIter = it;
        ++endIter;
        // endIter points to the first inst after the original mul
        for (auto iter = startIter; iter != endIter;)
        {
            // Save the successor first: doGenerateMacl inserts instructions
            // after iter, so advancing afterwards would visit them too.
            auto nextIter = iter;
            ++nextIter;
            G4_INST* currInst = *iter;
            if (currInst->opcode() == G4_mul)
            {
                doGenerateMacl(iter, bb);
            }
            iter = nextIter;
        }
    }
    else
    {
        doGenerateMacl(it, bb);
    }
}
2127
2128 // convert vISA mul (8) dst src0 src1 into
2129 // mul (8) acc0.0<1>:d src0:d src1:w
2130 // mach (8) dst:d src0:d src1:d
2131 //
// Expand a single-register dword mul into mul+macl (see the comment above).
// Both sources must be dword-typed and exec size must not exceed the native
// SIMD width. Depending on build options, either the macl is emitted here
// or the mul is only "sanitized" (modifiers/saturate/alignment fixed) and
// tagged with an implicit acc dst so expandMulPostSchedule can expand it
// after local scheduling.
void HWConformity::doGenerateMacl(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* mulInst = *it;
    MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
    assert(mulInst->getExecSize() <= builder.getNativeExecSize() && "expect single register inst");

    G4_Operand* src0 = mulInst->getSrc(0);
    G4_Operand* src1 = mulInst->getSrc(1);
    MUST_BE_TRUE(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()), "both sources must have dword type");

    // src1 does not support modifier
    checkSrcMod(it, bb, 1);
    // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
    // mulh dst:d src0:d src1:d
    // =>
    // mul acc0:d src0:d src1:uw
    // mach dst:d src0:d src1:d
    fixSrc1Region(it, bb);
    src1 = mulInst->getSrc(1);

    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(it, bb, 0);
        src0 = mulInst->getSrc(0);
    }

    // sat cannot be used at all in the macro sequence
    // this effectively means sat is broken for mul D D D
    mulInst->setSaturate(g4::NOSAT);

    G4_DstRegRegion* origDst = mulInst->getDst();
    // acc holds the low 32 bits; it is unsigned only if both sources are.
    G4_Type tmpType = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;
    if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMulPostSchedule))
    {
        // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Mul->Mul+Macl expanding will
        // be done in expandMulPostSchedule pass.

        // Need extra mov if dst is acc and src0 is indirect
        if (!builder.accDstforIndirectSrc())
        {
            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                mulInst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
            }
        }

        //need extra move for dst
        if (!IS_DTYPE(origDst->getType()) || origDst->getHorzStride() != 1 ||
            !builder.isOpndAligned(origDst, getGRFSize()))
        {
            // macl dst must be grf-aligned, packed D/UD as it is also used for the implicit acc source's region
            G4_DstRegRegion* tmpDst = insertMovAfter(it, origDst, tmpType, bb, GRFALIGN);
            mulInst->setDest(tmpDst);
        }

        // set implicit acc dst to the mul instruction as acc will be used as dst of the expanded mul after local scheduling.
        // it is a must to fix the WAR/WAW issue of acc in local scheduling.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, mulInst->getDst()->getType());
        mulInst->setImplAccDst(accDstOpnd);
    }
    else
    {
        // Emit the macro sequence now: retarget the mul at acc0, then insert
        // a macl that combines acc with the sources into the original dst.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
        mulInst->setDest(accDstOpnd);

        uint32_t origOptions = mulInst->getOption();
        fixMulSrc1(it, bb);
        mulInst->setOptionOn(InstOpt_WriteEnable);

        G4_Predicate* predicate = mulInst->getPredicate();
        if (predicate != nullptr)
        {
            // move pred to mach
            mulInst->setPredicate(nullptr);
        }
        if (mulInst->getCondMod() != nullptr)
        {
            // conditional modifier cannot be used
            // when the MUL source operand is of dword type.
            MUST_BE_TRUE(false, "Dw multiply does not support conditional modifiers");
            mulInst->setCondMod(nullptr);
        }

        // create a macl inst
        G4_INST* maclInst = builder.createMacl(mulInst->getExecSize(),
            origDst, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, tmpType);
        maclInst->setPredicate(predicate);

        // maintain du chain as fixAccDst uses it later
        mulInst->addDefUse(maclInst, Opnd_implAccSrc);

        INST_LIST_ITER machIter = it;
        machIter = bb->insertBefore(++machIter, maclInst);

        if (!IS_DTYPE(origDst->getType()) || origDst->getHorzStride() != 1 ||
            !builder.isOpndAligned(origDst, getGRFSize()))
        {
            // macl dst must be grf-aligned, packed D/UD as it is also used for the implicit acc source's region
            G4_DstRegRegion* tmpDst = insertMovAfter(machIter, origDst, tmpType, bb, GRFALIGN);
            maclInst->setDest(tmpDst);
        }
    }
}
2235
2236 // get rid of source modifiers on this inst[srcPos]
checkSrcMod(INST_LIST_ITER it,G4_BB * bb,int srcPos)2237 bool HWConformity::checkSrcMod(INST_LIST_ITER it, G4_BB* bb, int srcPos)
2238 {
2239 bool changed = false;
2240 G4_INST* inst = *it;
2241 assert(srcPos < inst->getNumSrc() && "invalid srcPos");
2242 auto src = inst->getSrc(srcPos);
2243 if (src->isSrcRegRegion())
2244 {
2245 G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
2246 if (srcRegion->getModifier() != Mod_src_undef)
2247 {
2248 G4_Type type = IS_DTYPE(src->getType()) ? src->getType() : Type_D;
2249 src = insertMovBefore(it, srcPos, type, bb);
2250 inst->setSrc(src, srcPos);
2251 changed = true;
2252 }
2253 }
2254 return changed;
2255 }
2256
2257 // If both source operands of an MUL instruction are of dword integer type,
2258 // only the lower 16 bits of data elements in src0 are used.
2259 // The full precision multiplication results can be only produced together
2260 // with the mach and mov instructions.
2261
// Legalize a dword x dword (or Q-dst) integer mul. Depending on platform
// capability this either leaves a native DW mul (possibly promoting sub-DW
// sources), expands via generateMacl, or emits the full mul/mach/mov macro
// sequence through acc0. Returns true if any instruction was inserted.
bool HWConformity::fixMULInst(INST_LIST_ITER& i, G4_BB* bb)
{
    bool insertedInst = false;
    G4_INST* inst = *i;
    G4_DstRegRegion* dst = inst->getDst();
    G4_ExecSize execSize = inst->getExecSize();
    // NOTE(review): srcExchanged is set below but never read afterwards
    // within this function.
    bool srcExchanged = false;

    if (dst->isAccReg())
    {
        return false;
    }

    uint32_t inst_opt = inst->getOption();
    G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);

    // MUL is commutative and only
    // allows src1 to be a constant.
    // If src0 is a constant and src1
    // is not, they are swapped here.
    // If both are constants, they
    // will be fixed in checking HW conformity.
    // this is fixed in fixOpnd.

    if (src0->isImm() && !src1->isImm())
    {
        inst->swapSrc(0, 1);
        srcExchanged = true;
    }

    if (!builder.supportSrcModforMul() &&
        (IS_DTYPE(src0->getType()) || IS_DTYPE(src1->getType())) &&
        ((src0->getTypeSize()) < 4 || (src1->getTypeSize()) < 4))

    {
        checkSrcMod(i, bb, 0);
        checkSrcMod(i, bb, 1);
    }

    src0 = inst->getSrc(0);
    src1 = inst->getSrc(1);
    // Q dst needs 64-bit support regardless of src type
    bool isDMul = IS_QTYPE(dst->getType()) || (IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()));

    if (!isDMul)
    {
        return false;
    }

    if (builder.hasMacl() && !IS_QTYPE(dst->getType()) &&
        (builder.noDwDstForDwordMul() || inst->getExecSize() > g4::SIMD1))
    {
        // use macl for D = D x D. We use macl when possible
        // except on scalar inst on platforms that support native DMul
        generateMacl(i, bb);
        return true;
    }

    bool doNativeMul = false;
    if (!builder.no64bitRegioning())
    {
        // platform natively supports DW-DW multiply, no need to generate mul/mach/mov sequence
        doNativeMul = true;
    }
    else
    {
        if ((builder.getPlatform() == GENX_CHV || builder.getPlatform() == GENX_BXT))
        {
            if (inst->getExecSize() == g4::SIMD1)
            {
                // scalar insts are a-ok
                return false;
            }
            // ok if source is scalar or qword-aligned
            doNativeMul = dst->getTypeSize() * dst->getHorzStride() == 8;
            auto isQWordStride = [inst, this](G4_SrcRegRegion* src)
            {
                const RegionDesc* region = src->getRegion();
                if (!region->isScalar())
                {
                    uint16_t stride = 0;
                    (void)region->isSingleNonUnitStride(inst->getExecSize(), stride);
                    if (stride != 2)
                    {
                        return false;
                    }
                    // check that source is GRF-aligned to ensure that every element is qword-aligned
                    return builder.isOpndAligned(src, 32);
                }
                return true;
            };
            if (doNativeMul && src0->isSrcRegRegion())
            {
                doNativeMul = isQWordStride(src0->asSrcRegRegion());
            }
            if (doNativeMul && src1->isSrcRegRegion())
            {
                doNativeMul = isQWordStride(src1->asSrcRegRegion());
            }
        }
    }

    if (doNativeMul)
    {
        // promote source to D type if necessary
        if (IS_QTYPE(dst->getType()))
        {
            G4_Type newTy;
            G4_Operand* newOpnd;
            if (!IS_DTYPE(src0->getType()))
            {
                newTy = IS_SIGNED_INT(src0->getType()) ? Type_D : Type_UD;
                newOpnd = insertMovBefore(i, 0, newTy, bb);
                inst->setSrc(newOpnd, 0);
                insertedInst = true;
            }

            if (!IS_DTYPE(src1->getType()))
            {
                newTy = IS_SIGNED_INT(src1->getType()) ? Type_D : Type_UD;
                if (src1->isImm())
                {
                    newOpnd = builder.createImm(src1->asImm()->getImm(), newTy);
                }
                else
                {
                    newOpnd = insertMovBefore(i, 1, newTy, bb);
                }
                inst->setSrc(newOpnd, 1);
                insertedInst = true;
            }
        }
        return insertedInst;
    }

    // both sources are dword, replace with mul/mach/mov sequence
    // At this point, src0 and src1 are both DW, so we simply make
    // acc's type (i.e. dst_type) be DW/UD

    G4_CondMod* condmod = builder.duplicateOperand(inst->getCondMod());
    G4_Predicate* pred = builder.duplicateOperand(inst->getPredicate());

    G4_Type tmp_type = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;

    // src1 does not support modifier
    checkSrcMod(i, bb, 1);
    src1 = inst->getSrc(1);

    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(i, bb, 0);
        src0 = inst->getSrc(0);
    }

    // Saturate is deferred to the final mov (or dropped); the mul that
    // writes acc must not saturate.
    auto satMod = inst->getSaturate();
    inst->setSaturate(g4::NOSAT);

    G4_DstRegRegion* acc_dst_opnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmp_type);
    inst->setDest(acc_dst_opnd);
    fixMulSrc1(i, bb);

    inst->setNoMask(true);

    if (pred)
    {
        // conditional modifier cannot be used
        // when the MUL source operand is of dword type.
        inst->setCondMod(nullptr);
    }

    // Dst is either null, or a temp D if the original dst is Q/UQ
    G4_DstRegRegion* machDst = NULL;
    G4_Declare* high32BitDcl = NULL;
    if (IS_QTYPE(dst->getType()))
    {
        high32BitDcl = builder.createTempVar(execSize, Type_D, Any);
        machDst = builder.createDstRegRegion(high32BitDcl, 1);
    }
    else
    {
        machDst = builder.createNullDst(Type_D);
    }

    // create a mach inst
    G4_INST* newInst = builder.createMach(execSize, machDst,
        builder.duplicateOperand(src0), builder.duplicateOperand(src1), inst_opt, tmp_type);

    INST_LIST_ITER iter = i;
    iter++;
    bb->insertBefore(iter, newInst);

    inst->setPredicate(nullptr);

    // Move the def/use bookkeeping from the (retargeted) mul to the mach.
    inst->copyDef(newInst, Opnd_src0, Opnd_src0);
    inst->copyDef(newInst, Opnd_src1, Opnd_src1);
    inst->transferUse(newInst);
    inst->addDefUse(newInst, Opnd_implAccSrc);

    // create an explicit acc source for later use
    const RegionDesc* rd = execSize > 1 ? builder.getRegionStride1() : builder.getRegionScalar();
    G4_SrcRegRegion* acc_src_opnd = builder.createSrc(
        builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);

    insertedInst = true;

    if (IS_QTYPE(dst->getType()))
    {
        // we have to produce two additional moves to form the Q/UQ:
        // mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
        // mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d
        // mov (8) r6.0<1>:d acc0:d // Low 32 bits.
        // mov (8) dst.0<2>:d r6.0<1>:d
        // mov (8) dst.1<2>:d r5.0<1>:d
        // Note that we don't try to combine the moves because of the HW restriction that
        // "If an accumulator is an explicit source operand, its register region must match that of the destination register"

        G4_Declare* low32BitDcl = builder.createTempVar(execSize, Type_D, Any);
        G4_INST* movInst = builder.createMov(execSize,
            builder.createDstRegRegion(low32BitDcl, 1),
            builder.createSrcRegRegion(*acc_src_opnd), inst_opt, false);
        bb->insertBefore(iter, movInst);

        G4_DstRegRegion* origDst = dst;
        bool needsExtraMov = origDst->getHorzStride() > 1 || condmod != NULL || satMod;

        // When no extra mov is needed the two halves are written straight
        // into the original dst via a D-typed alias of it.
        G4_Declare* dstAlias = builder.createTempVar(execSize * 2, Type_D, Any);
        if (!needsExtraMov)
        {
            uint32_t aliasOffset = origDst->getRegOff() * numEltPerGRF<Type_UB>() + origDst->getSubRegOff() * 8;
            dstAlias->setAliasDeclare(origDst->getBase()->asRegVar()->getDeclare(), aliasOffset);
        }
        G4_INST* lowMove = builder.createMov(execSize,
            builder.createDstRegRegion(dstAlias, 2),
            builder.createSrcRegRegion(low32BitDcl, builder.getRegionStride1()),
            inst_opt, false);
        lowMove->setPredicate(pred);

        bb->insertBefore(iter, lowMove);

        MUST_BE_TRUE(high32BitDcl != NULL, "mach dst must not be null");
        G4_INST* highMove = builder.createMov(execSize,
            builder.createDst(dstAlias->getRegVar(), 0, 1, 2, dstAlias->getElemType()),
            builder.createSrcRegRegion(high32BitDcl, builder.getRegionStride1()),
            inst_opt, false);
        highMove->setPredicate(pred);
        bb->insertBefore(iter, highMove);

        if (needsExtraMov)
        {
            // this will take care of non-packed dst/cond mod/saturate
            G4_Declare* dstAliasAsQ = builder.createTempVar(execSize, Type_Q, Any);
            dstAliasAsQ->setAliasDeclare(dstAlias, 0);
            G4_INST* moveInst = builder.createMov(execSize, dst, builder.createSrcRegRegion(dstAliasAsQ, builder.getRegionStride1()),
                inst_opt, false);
            moveInst->setCondMod(condmod);
            moveInst->setSaturate(satMod);
            bb->insertBefore(iter, moveInst);
        }

        return true;
    }

    INST_LIST_ITER last_iter;
    // create a mov inst
    if (satMod == g4::NOSAT)
    {
        bool extra_mov = dst && dst->getExecTypeSize() > TypeSize(Type_D);
        extra_mov |= isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst);

        G4_INST* movInst = builder.createMov(execSize, dst, builder.createSrcRegRegion(*acc_src_opnd),
            inst_opt, false);
        movInst->setPredicate(pred);
        movInst->setCondMod(condmod);

        newInst->transferUse(movInst);
        newInst->addDefUse(movInst, Opnd_src0);

        bb->insertBefore(iter, movInst);
        last_iter = iter;
        last_iter--;
        if (extra_mov)
        {
            // add a tmp mov
            iter--;
            G4_DstRegRegion* new_next_dst = insertMovAfter(iter, dst, dst->getType(), bb);
            movInst->setDest(new_next_dst);
            movInst->setPredicate(NULL);
        }
    }
    else
    {
        // create an extra mov inst
        G4_Declare* dcl = builder.createTempVar(
            execSize,
            tmp_type,
            GRFALIGN);

        G4_DstRegRegion* tmp_dst_opnd = builder.createDst(
            dcl->getRegVar(),
            0,
            0,
            1,
            tmp_type);
        G4_INST* movInst = builder.createMov(execSize, tmp_dst_opnd,
            builder.createSrcRegRegion(*acc_src_opnd), InstOpt_NoOpt, false);
        movInst->setCondMod(condmod);
        bb->insertBefore(iter, movInst);

        last_iter = iter;
        last_iter--;

        G4_SrcRegRegion* tmp_src_opnd = builder.createSrc(dcl->getRegVar(), 0, 0, rd, tmp_type);

        // Final mov carries predicate/cond-mod/saturate into the real dst.
        G4_INST* newInst2 = builder.createInternalInst(
            pred, G4_mov, condmod, satMod, execSize, dst, tmp_src_opnd, NULL, inst_opt);

        newInst->transferUse(newInst2);
        newInst->addDefUse(movInst, Opnd_src0);
        movInst->addDefUse(newInst2, Opnd_src0);
        bb->insertBefore(iter, newInst2);
        iter++;
    }

    if (execSize > builder.getNativeExecSize())
    {
        splitDWMULInst(i, last_iter, bb);
    }

    return insertedInst;
}
2592
2593
2594 // Translate MULH into
2595 // MUL acc src0 src1
2596 // MACH dst src0 src1
// Legalize mulh (high 32 bits of a 32x32 multiply). Preferred expansion is
// a single mul with Q dst plus a strided mov to extract the high dword;
// otherwise the mul(acc)/mach pair is emitted (either here, or deferred to
// expandMulPostSchedule depending on build options). On return, <i> points
// at the first instruction that still needs conformity checking.
void HWConformity::fixMULHInst(INST_LIST_ITER& i, G4_BB* bb)
{
    G4_INST* inst = *i;
    G4_ExecSize execSize = inst->getExecSize();

    int inst_opt = inst->getOption();

    G4_Operand* src0 = inst->getSrc(0), * src1 = inst->getSrc(1);

    // mulh is commutative; only src1 may be an immediate.
    if (src0->isImm() && !src1->isImm())
    {
        inst->swapSrc(0, 1);
        src0 = inst->getSrc(0);
        src1 = inst->getSrc(1);
    }

    bool useMulQDD = false;
    if (execSize <= builder.getNativeExecSize() && !builder.no64bitRegioning() &&
        builder.supportFloatOr64bRegioning())
    {
        useMulQDD = true;
        if (!IS_DTYPE(src0->getType()) || !IS_DTYPE(src1->getType()))
        {
            if (src1->isImm() &&
                IS_DTYPE(src0->getType()) &&
                (IS_WTYPE(src1->getType()) || IS_BTYPE(src1->getType())))
            {
                // Ensure src1 has the same type size as src0.
                const G4_Imm* oldImm = src1->asImm();
                G4_Imm* newImm = builder.createImm(oldImm->getInt(), src0->getType());
                inst->setSrc(newImm, 1);
            }
            else
            {
                useMulQDD = false;
            }
        }
    }
    if (useMulQDD)
    {
        // use mul Q D D to get the upper 32-bit
        // note that we don't do this for CHV/BXT due to the 64-bit type restrictions
        inst->setOpcode(G4_mul);
        G4_DstRegRegion* dst = inst->getDst();
        G4_Type dstType = dst->getType();

        // Widen the dst type to 64-bit, preserving signedness.
        if (dstType == Type_UD)
            dstType = Type_UQ;
        else
            dstType = Type_Q;
        G4_Declare* dstDcl = dst->getBase()->asRegVar()->getDeclare();
        G4_Declare* tmpDcl = builder.createTempVar(
            execSize,
            dstType,
            Any,
            "TV");
        tmpDcl->copyAlign(dstDcl);

        G4_DstRegRegion* tmpDst = builder.createDstRegRegion(tmpDcl, 1);
        inst->setDest(tmpDst);

        //need move to cast back to D/UD type
        // Sub-reg offset 1 with stride 2 selects the high dword of each
        // qword result.
        G4_SrcRegRegion* tmpSrc = builder.createSrc(
            tmpDcl->getRegVar(),
            0,
            1,
            execSize > 1 ? builder.getRegionStride2() : builder.getRegionScalar(),
            dst->getType());

        G4_INST* tmpMov = builder.createMov(execSize, dst, tmpSrc, inst->getOption(), false);
        tmpMov->setPredicate(builder.duplicateOperand(inst->getPredicate()));

        bb->insertAfter(i, tmpMov);

        // Check the new inserted mov inst
        i++;

        // Need to remove dst from uses list of mulh, and add them to movInst useList
        // add movInst to uselist of mulh.
        // Add mulh to def instruction list of movInst
        inst->transferUse(tmpMov);
        inst->addDefUse(tmpMov, Opnd_src0);
        return;
    }

    // src1 does not support modifier
    checkSrcMod(i, bb, 1);
    // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
    // mulh dst:d src0:d src1:d
    // =>
    // mul acc0:d src0:d src1:uw
    // mach dst:d src0:d src1:d
    fixSrc1Region(i, bb);
    src1 = inst->getSrc(1);

    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(i, bb, 0);
        src0 = inst->getSrc(0);
    }

    G4_Type tmp_type = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;

    assert(IS_DTYPE(src0->getType()) && "src0 must be DW type");


    if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMulPostSchedule))
    {
        // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Mul->Mul + Macl expanding will
        // be done in expandMulPostSchedule pass.

        if (src1->isImm() && src0->getType() != src1->getType())
        {
            G4_Imm* oldImm = src1->asImm();
            // Ensure src1 has the same type as src0.
            inst->setSrc(builder.createImm(oldImm->getInt(), src0->getType()), 1);
        }
        else if (!IS_DTYPE(src1->getType()))
        {
            // this can happen due to vISA opt, convert them to src0 type which should be D/UD
            // We use D as the tmp type to make sure we can represent all src1 values
            inst->setSrc(insertMovBefore(i, 1, Type_D, bb), 1);
        }

        // Need extra mov if dst is acc and src0 is indirect
        if (!builder.accDstforIndirectSrc())
        {
            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                inst->setSrc(insertMovBefore(i, 0, src0->getType(), bb), 0);
            }
        }

        INST_LIST_ITER end_iter = i;
        // this mul will be expanded into mul+macl in expandMulPostSchedule pass. Since expanded macl
        // must be grf-aligned, so need to make mul to be grf-aligned.
        G4_DstRegRegion* dst = inst->getDst();
        if (inst->getSaturate() ||
            dst->getExecTypeSize() > TypeSize(Type_D) ||
            isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst) ||
            !builder.isOpndAligned(dst, getGRFSize()))
        {
            // add a tmp mov
            inst->setDest(insertMovAfter(i, dst, dst->getType(), bb, GRFALIGN));
            end_iter++;
        }

        // sat cannot be used at all in the macro sequence
        // this effectively means sat is broken for mul D D D
        inst->setSaturate(g4::NOSAT);

        // set implicit acc dst to the mulh instruction as acc will be used as dst of the expanded mul after local scheduling.
        // it is a must to fix the WAR/WAW issue of acc in local scheduling.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, inst->getDst()->getType());
        inst->setImplAccDst(accDstOpnd);

        if (execSize > builder.getNativeExecSize())
        {
            auto start_iter = i;
            splitDWMULInst(start_iter, end_iter, bb);
            // start_iter points to the first half of mulh. Need double check this new inserted mulh to see if need split again
            i = start_iter;
        }
        else
        {
            i++;
        }
    }
    else
    {
        // Emit the macro sequence now: mul into acc0, then replace the mulh
        // with a mach that reads acc implicitly.
        G4_DstRegRegion* acc_dst_opnd = builder.createDst(
            builder.phyregpool.getAcc0Reg(),
            0,
            0,
            1,
            tmp_type);

        G4_INST* newMul = builder.createBinOp(G4_mul, execSize,
            acc_dst_opnd, builder.duplicateOperand(src0), builder.duplicateOperand(src1), inst_opt, false);

        bb->insertBefore(i, newMul);
        inst->copyDefsTo(newMul, false);

        fixMulSrc1(std::prev(i), bb);
        newMul->setNoMask(true);

        auto machSrc1 = inst->getSrc(1);
        if (src1->isImm() && src0->getType() != src1->getType())
        {
            G4_Imm* oldImm = src1->asImm();
            // Ensure src1 has the same type as src0.
            machSrc1 = builder.createImm(oldImm->getInt(), src0->getType());
        }
        else if (!IS_DTYPE(src1->getType()))
        {
            // this can happen due to vISA opt, convert them to src0 type which should be D/UD
            // We use D as the tmp type to make sure we can represent all src1 values
            machSrc1 = insertMovBefore(i, 1, Type_D, bb);
        }

        // We don't duplicate the operands here as original inst is unlinked
        // ToDo: this invalidate du-chain, do we still need to maintain it?
        auto machInst = builder.createMach(inst->getExecSize(), inst->getDst(), inst->getSrc(0), machSrc1, inst_opt, tmp_type);
        machInst->setPredicate(inst->getPredicate());
        machInst->setCondMod(inst->getCondMod());
        *i = machInst;
        inst->transferUse(machInst);
        inst->removeAllDefs();
        newMul->addDefUse(machInst, Opnd_implAccSrc);

        INST_LIST_ITER end_iter = i;
        // check if the ACC source is aligned to mach dst
        // ToDo: this should be checked by fixAcc?
        G4_DstRegRegion* dst = inst->getDst();
        if (inst->getSaturate() ||
            dst->getExecTypeSize() > TypeSize(Type_D) ||
            isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst))
        {
            // add a tmp mov
            machInst->setDest(insertMovAfter(i, dst, dst->getType(), bb));
            end_iter++;
        }

        if (execSize > builder.getNativeExecSize())
        {
            auto start_iter = std::prev(i);
            splitDWMULInst(start_iter, end_iter, bb);
            // start_iter points to the first half of mul. Need to check the new inserted mul/mach instructions
            i = start_iter;
        }
        else
        {
            // i points to mach, and need to check the new inserted mul before mach
            i = std::prev(i);
        }
    }
    return;
}
2835
2836 //
2837 // insert move instructions to copy numDwords dwords from src to dst at the specified location
2838 // a NoMask UD move is used.
2839 // dst and src must be dword-aligned.
2840 // srcOffset and dstOffset are in bytes
// numDwords must be one of {1,2,4,8,16,32}
2842 // ToDo: may want to generalize this into a copyBytes function that selects the appropriate move type
2843 // based on dst and src type
2844 //
copyDwords(G4_Declare * dst,int dstOffset,G4_Declare * src,int srcOffset,int numDwords,G4_BB * bb,INST_LIST_ITER iter)2845 void HWConformity::copyDwords(G4_Declare* dst,
2846 int dstOffset,
2847 G4_Declare* src,
2848 int srcOffset,
2849 int numDwords,
2850 G4_BB* bb,
2851 INST_LIST_ITER iter)
2852 {
2853
2854 MUST_BE_TRUE(numDwords == 1 || numDwords == 2 || numDwords == 4 ||
2855 numDwords == 8 || numDwords == 16 || numDwords == 32, "invalid number of dwords to copy");
2856
2857 G4_Declare* newDst = dst;
2858
2859 if (dst->getElemType() != Type_UD)
2860 {
2861 // create an alias with type UD
2862 newDst = builder.createTempVar(numDwords, Type_UD, Any);
2863 newDst->setAliasDeclare(dst, 0);
2864 }
2865
2866 G4_Declare* newSrc = src;
2867 if (src->getElemType() != Type_UD)
2868 {
2869 // create an alias with type UD
2870 newSrc = builder.createTempVar(numDwords, Type_UD, Any);
2871 newSrc->setAliasDeclare(src, 0);
2872 }
2873
2874 G4_SrcRegRegion* srcOpnd = builder.createSrc(
2875 newSrc->getRegVar(), srcOffset / numEltPerGRF<Type_UB>(),
2876 (srcOffset % numEltPerGRF<Type_UB>()) / TypeSize(Type_UD),
2877 builder.getRegionStride1(), Type_UD);
2878 G4_DstRegRegion* dstOpnd = builder.createDst(newDst->getRegVar(),
2879 dstOffset / numEltPerGRF<Type_UB>(),
2880 (dstOffset % numEltPerGRF<Type_UB>()) / TypeSize(Type_UD), 1, Type_UD);
2881
2882 G4_INST* movInst = builder.createMov(G4_ExecSize(numDwords), dstOpnd, srcOpnd, InstOpt_WriteEnable, false);
2883
2884 INST_LIST_ITER movPos = bb->insertBefore(iter, movInst);
2885
2886 if (numDwords == numEltPerGRF<Type_UD>() * 2 &&
2887 ((dstOffset % numEltPerGRF<Type_UB>()) != 0 || (srcOffset % numEltPerGRF<Type_UB>()) != 0))
2888 {
2889 // move crosses 2 GRF boundary, needs splitting
2890 evenlySplitInst(movPos, bb);
2891 }
2892 }
2893
2894 // like the above, but source is an indirect 64-bit source and dst offset is always 0
2895 // If source is Indirect 1x1, we generate
2896 // mov (esize*2) tmp<1>:ud r[A0]<1;1,0>:ud
2897 // ... tmpSrc<region>:q
2898 // If source is VxH indirect, we have to generate instead
2899 // mov (esize*2) tmp<1>:ud r[A0]<2,1>:ud
2900 // ... tmpSrc<1;1,0>:q
2901 // as we can't have the indirect region on the 64-bit type operand
2902 // A0 is not changed otherwise
copyDwordsIndirect(G4_Declare * dst,G4_SrcRegRegion * src,int numDwords,G4_BB * bb,INST_LIST_ITER iter)2903 void HWConformity::copyDwordsIndirect(G4_Declare* dst,
2904 G4_SrcRegRegion* src,
2905 int numDwords,
2906 G4_BB* bb,
2907 INST_LIST_ITER iter)
2908 {
2909 MUST_BE_TRUE(
2910 TypeSize(dst->getElemType()) >= 4 && src->getTypeSize() >= 4,
2911 "dst and src must have dword or qword type");
2912
2913 MUST_BE_TRUE(src->getRegAccess() == IndirGRF, "source must be indirect GRF");
2914
2915 G4_Declare* newDst = dst;
2916
2917 if (dst->getElemType() != Type_UD)
2918 {
2919 // create an alias with type UD
2920 newDst = builder.createTempVar(numDwords, Type_UD, Any);
2921 newDst->setAliasDeclare(dst, 0);
2922 }
2923
2924 G4_SrcRegRegion* newSrc = builder.duplicateOperand(src);
2925 MUST_BE_TRUE(newSrc->getTypeSize() == 8, "only support 64-bit type source so far");
2926 newSrc->setType(Type_UD);
2927 newSrc->setModifier(Mod_src_undef);
2928 if (newSrc->getRegion()->isRegionWH())
2929 {
2930 MUST_BE_TRUE(newSrc->getRegion()->width == 1, "only handle <1,0> region for now");
2931 newSrc->setRegion(builder.createRegionDesc(UNDEFINED_SHORT, 2, 1));
2932 }
2933 else
2934 {
2935 newSrc->setRegion(builder.getRegionStride1());
2936 }
2937
2938 G4_DstRegRegion* dstOpnd = builder.createDst(newDst->getRegVar(), 0, 0, 1, Type_UD);
2939
2940 G4_INST* movInst = builder.createMov(G4_ExecSize(numDwords), dstOpnd, newSrc, InstOpt_WriteEnable, false);
2941
2942 bb->insertBefore(iter, movInst);
2943 }
2944
2945 // copy numRegs GRFs from src[srcOffset] to dst[dstOffset]
2946 // dst[dstOffset] and src[srcOffset] are both GRF-aligned
copyRegs(G4_Declare * dst,int dstOffset,G4_Declare * src,int srcOffset,int numRegs,G4_BB * bb,INST_LIST_ITER iter)2947 void HWConformity::copyRegs(G4_Declare* dst,
2948 int dstOffset,
2949 G4_Declare* src,
2950 int srcOffset,
2951 int numRegs,
2952 G4_BB* bb,
2953 INST_LIST_ITER iter)
2954 {
2955 int numByteCopied = 0;
2956 for (; numRegs >= 2; numRegs -= 2, numByteCopied += numEltPerGRF<Type_UB>() * 2)
2957 {
2958 copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, numEltPerGRF<Type_UD>() * 2, bb, iter);
2959 }
2960 if (numRegs != 0)
2961 {
2962 copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, numEltPerGRF<Type_UD>(), bb, iter);
2963 }
2964 }
2965
2966 //
2967 // Note that this function may invalidate <iter>
2968 //
// Emulate a 64-bit integer mov on platforms without native int64 support by
// rewriting it as (at most) two 32-bit moves covering the low/high dwords.
// Handles reg-region and immediate sources; returns true if the original
// mov was replaced (or retyped in place), false if no emulation applied.
// NOTE: may invalidate <iter> (the original instruction can be erased).
bool HWConformity::emulate64bMov(INST_LIST_ITER iter, G4_BB* bb)
{
    auto inst = (*iter);
    auto origIter = iter;
    auto dst = inst->getDst();
    auto src0 = inst->getSrc(0);

    MUST_BE_TRUE(!inst->getCondMod(), "cant handle cond mod");
    auto dstHS = dst->getHorzStride();

    // Builds an add that bumps the address register(s) behind an indirect
    // operand by <increment> bytes. Used when adding +4 to the immediate
    // address offset would exceed the 512-byte encoding limit; the caller
    // later emits a matching -4 to restore the address register.
    auto incrementVar = [&](G4_Operand* var, unsigned int width, unsigned int regOff, unsigned int sregOff, G4_INST* inst, short increment)
    {
        auto addrDst = builder.createDst(var->getBase(), regOff, sregOff, 1, Type_UW);
        auto addrSrc = builder.createSrc(var->getBase(), regOff, sregOff,
            builder.getRegionStride1(), Type_UW);
        auto incrementImm = builder.createImm(increment, Type_W);
        auto addrAddInst = builder.createInternalInst(
            nullptr, G4_add, nullptr, g4::NOSAT,
            G4_ExecSize(inst->getExecSize() / width),
            addrDst, addrSrc, incrementImm, InstOpt_WriteEnable);
        return addrAddInst;
    };

    if (src0->isSrcRegRegion())
    {
        auto src0RR = src0->asSrcRegRegion();
        MUST_BE_TRUE(IS_INT(src0RR->getType()) && IS_INT(dst->getType()), "expecting int types on src, dst");
        MUST_BE_TRUE(src0RR->getModifier() == Mod_src_undef, "cannot handle saturation");

        // Region for the dword-typed halves of the source. Strides are
        // scaled so that consecutive UD elements still address the same
        // qword lanes as the original region.
        const RegionDesc* rgnToUse = nullptr;

        if (src0RR->getRegion()->isScalar())
            rgnToUse = builder.getRegionScalar();
        else if (!src0RR->isIndirect())
        {
            uint16_t stride = 0;
            bool legal = src0RR->getRegion()->isSingleStride(inst->getExecSize(), stride);
            MUST_BE_TRUE(legal, "unsupported region");
            if (stride == 1)
                rgnToUse = builder.getRegionStride2();
            else if (stride == 2)
                rgnToUse = builder.getRegionStride4();
            else
                MUST_BE_TRUE(false, "unsupported stride");
        }
        else
        {
            if (src0RR->getTypeSize() < 8)
                rgnToUse = src0RR->getRegion();
            else
            {
                // this will be broken up into 2 instructions
                auto factor = src0RR->getTypeSize() / dst->getTypeSize();
                auto vs = src0RR->getRegion()->vertStride * factor;
                auto w = src0RR->getRegion()->width;
                auto hs = src0RR->getRegion()->horzStride * factor;
                rgnToUse = builder.createRegionDesc(vs, w, hs);
            }
        }

        if (dst->getTypeSize() == 8)
        {
            if (src0->getTypeSize() == 8)
            {
                // may be q->uq or uq->q or raw mov
                // safe to do raw copy for all 3 cases

                bool isNoMaskInst = !inst->getPredicate() && (inst->isWriteEnableInst() || bb->isAllLaneActive());
                if (isNoMaskInst && inst->getExecSize() == g4::SIMD1 && src0->asSrcRegRegion()->isScalar())
                {
                    // For SIMD1 case that is not under divergent CF, we can change to UD type directly:
                    // mov (1) r10.1<1>:uq r20.0<0;1,0>:uq
                    // =>
                    // mov (2) r10.2<1>:ud r20.0<1;1,0>:ud
                    G4_DstRegRegion* newDst = nullptr;
                    if (dst->isIndirect())
                    {
                        // indirect subreg offset addresses the A0 register, so no *2 scaling
                        newDst = builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), dst->getHorzStride(), Type_UD, dst->getAddrImm());
                    }
                    else
                    {
                        newDst = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, dst->getHorzStride(), Type_UD, dst->getAccRegSel());
                    }

                    G4_SrcRegRegion* newSrc = nullptr;
                    if (src0->getRegAccess() == Direct)
                    {
                        newSrc = builder.createSrcRegRegion(src0RR->getModifier(), Direct, src0RR->getBase(),
                            src0RR->getRegOff(), src0RR->getSubRegOff() * 2, builder.getRegionStride1(), Type_UD);
                    }
                    else
                    {
                        newSrc = builder.createIndirectSrc(src0RR->getModifier(), src0RR->getBase(), src0RR->getRegOff(),
                            src0RR->getSubRegOff(), builder.getRegionStride1(), Type_UD, src0RR->getAddrImm());
                    }

                    // retype in place: same instruction, doubled exec size
                    inst->setSrc(newSrc, 0);
                    inst->setDest(newDst);
                    inst->setExecSize(G4_ExecSize(inst->getExecSize() * 2u));
                    inst->setOptionOn(InstOpt_WriteEnable);
                    inst->setMaskOption(InstOpt_M0);

                    return true;
                }
                else
                {
                    // mov (8) r10.0<1>:uq r20.0<1;1,0>:uq
                    // =>
                    // mov (8) r10.0<2>:ud r20.0<2;1,0>:ud
                    // mov (8) r10.1<2>:ud r20.1<2;1,0>:ud

                    // 1st half (low dwords)
                    auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_UD));
                    auto newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
                        src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * 2), rgnToUse, Type_UD);
                    newSrc->setImmAddrOff(src0RR->getAddrImm());
                    auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    // second half (high dwords, at byte offset +4)
                    bool dstAddrIncremented = false, src0AddrIncremented = false;
                    unsigned int immAddrOff = 4;
                    if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
                    {
                        // increment dst address register by 4, later decrement it
                        dstAddrIncremented = true;
                        immAddrOff = 0;
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
                    }
                    newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, immAddrOff + dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_UD));
                    newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
                        src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * 2 + 1), rgnToUse, Type_UD);
                    if (newSrc->isIndirect())
                    {
                        // upper 4 bytes
                        if ((4 + src0RR->getAddrImm()) > 512)
                        {
                            src0AddrIncremented = true;
                            iter = bb->insertBefore(origIter, incrementVar(src0RR, src0RR->getRegion()->width, src0RR->getRegOff(), src0RR->getSubRegOff(), inst, 4));
                            newSrc->setImmAddrOff(src0RR->getAddrImm());
                        }
                        else
                            newSrc->setImmAddrOff(4 + src0RR->getAddrImm());
                    }
                    newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    // restore any temporarily bumped address registers
                    if (dstAddrIncremented)
                    {
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
                    }

                    if (src0AddrIncremented)
                    {
                        iter = bb->insertBefore(origIter, incrementVar(src0RR, src0RR->getRegion()->width, src0RR->getRegOff(), src0RR->getSubRegOff(), inst, -4));
                    }

                    bb->erase(origIter);

                    return true;
                }
            }
            else if (dst->getTypeSize() == 8 && src0->getTypeSize() < 8)
            {
                // widening: d/ud/w/uw/b/ub -> q/uq
                if (IS_SIGNED_INT(src0->getType()))
                {
                    // when src is signed, sign extend
                    // b/w/d -> q/uq
                    //
                    // dst<2>.0:d = src:[d|w|b]
                    // dst<2>.1:d = asr dst<2>.0:d 31
                    auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_D));
                    auto newSrc = builder.createSrcRegRegion(*src0RR);
                    auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    bool dstAddrIncremented = false;
                    unsigned int immAddrOff = 4;
                    if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
                    {
                        // increment dst address register by 4, later decrement it
                        dstAddrIncremented = true;
                        immAddrOff = 0;
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
                    }

                    // high dword = low dword >> 31 (arithmetic) to replicate the sign bit
                    newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, immAddrOff + dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_D));
                    if (dst->isIndirect())
                    {
                        newSrc = builder.createSrcRegRegion(Mod_src_undef, IndirGRF, dst->getBase(), dst->getRegOff(), dst->getSubRegOff(),
                            rgnToUse, Type_D);
                        newSrc->setImmAddrOff(newDst->getAddrImm());
                    }
                    else
                        newSrc = builder.createSrc(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2,
                            builder.getRegionStride2(), Type_D);
                    auto imm31 = builder.createImm(31, Type_W);
                    newInst = builder.createBinOp(G4_asr, inst->getExecSize(), newDst, newSrc, imm31, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    if (dstAddrIncremented)
                    {
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
                    }

                    bb->erase(origIter);

                    return true;
                }
                else
                {
                    // when src is unsigned, zero extend
                    // ub/uw/ud -> q/uq
                    //
                    // dst<2>.0:ud = src:[ud|uw|ub]
                    // dst<2>.1:ud = 0

                    auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_UD));
                    auto newSrc = builder.createSrcRegRegion(*src0RR);
                    auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    bool dstAddrIncremented = false;
                    unsigned int immAddrOff = 4;
                    if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
                    {
                        // increment dst address register by 4, later decrement it
                        dstAddrIncremented = true;
                        immAddrOff = 0;
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
                    }
                    newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_UD, immAddrOff + dst->getAddrImm())) :
                        (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_UD));
                    auto imm0 = builder.createImm(0);
                    newInst = builder.createMov(inst->getExecSize(), newDst, imm0, inst->getOption(), false);
                    newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
                    iter = bb->insertBefore(origIter, newInst);

                    if (dstAddrIncremented)
                    {
                        iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
                    }

                    bb->erase(origIter);

                    return true;
                }
            }
        }
        else if (dst->getTypeSize() < 8 && src0->getTypeSize() == 8)
        {
            // truncate
            // q/uq -> d/ud/w/uw/b/ub
            // 1. mov(8) r10.0<1>:d r20.0<1;1,0>:uq
            // =>
            // mov(8) r10.0<1>:d r20.0<2;1,0>:d
            //
            // 2. mov(8) r10.0<1>:d r20.1<2;1,0>:uq
            // =>
            // mov(8) r10.0<1>:d r20.2<4;1,0>:d

            // single mov reading only the low part of each qword
            unsigned int factor = src0->getTypeSize() / dst->getTypeSize();
            auto newDst = builder.createDstRegRegion(*dst);
            auto newSrc = builder.createSrcRegRegion(Mod_src_undef, src0RR->getRegAccess(), src0RR->getBase(), src0RR->getRegOff(),
                src0RR->isIndirect() ? src0RR->getSubRegOff() : (src0RR->getSubRegOff() * factor), rgnToUse, dst->getType());
            newSrc->setImmAddrOff(src0RR->getAddrImm());
            auto newInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
            newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
            iter = bb->insertBefore(origIter, newInst);

            bb->erase(origIter);

            return true;
        }
    }
    else if (src0->isImm())
    {
        // 64-bit immediate: split into two 32-bit immediate moves
        auto imm = src0->asImm()->getInt();
        int low = imm & 0xffffffff;
        int high = (imm >> 32) & 0xffffffff;

        // low
        auto newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, dst->getAddrImm())) :
            (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2, 2 * dstHS, Type_D));
        auto immLowSrc = builder.createImm(low, Type_D);
        auto newInst = builder.createMov(inst->getExecSize(), newDst, immLowSrc, inst->getOption(), false);
        newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
        iter = bb->insertBefore(origIter, newInst);

        // high
        bool dstAddrIncremented = false;
        unsigned int immAddrOff = 4;
        if (dst->isIndirect() && (4 + dst->getAddrImm()) > 512)
        {
            // increment dst address register by 4, later decrement it
            dstAddrIncremented = true;
            immAddrOff = 0;
            iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, 4));
        }
        newDst = dst->isIndirect() ? (builder.createIndirectDst(dst->getBase(), dst->getSubRegOff(), 2 * dstHS, Type_D, immAddrOff + dst->getAddrImm())) :
            (builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff() * 2 + 1, 2 * dstHS, Type_D));
        auto immHigh = builder.createImm(high, Type_D);
        newInst = builder.createMov(inst->getExecSize(), newDst, immHigh, inst->getOption(), false);
        newInst->setPredicate(inst->getPredicate() ? builder.createPredicate(*inst->getPredicate()) : nullptr);
        iter = bb->insertBefore(origIter, newInst);

        if (dstAddrIncremented)
        {
            iter = bb->insertBefore(origIter, incrementVar(dst, inst->getExecSize(), dst->getRegOff(), dst->getSubRegOff(), inst, -4));
        }

        bb->erase(origIter);

        return true;
    }

    return false;
}
3298
// Enforce the CHV/BXT restrictions on 64-bit operands (no indirect
// addressing, aligned strides, VS = W * HS, same src/dst subreg offset)
// by retyping, re-regioning, or copying operands through GRF-aligned temps.
// Returns true only when the instruction was fully handled by emulate64bMov.
bool HWConformity::fix64bInst(INST_LIST_ITER iter, G4_BB* bb)
{

    // HW restrictions:
    // [DevCHV, DevBXT]: When source or destination datatype is 64b, indirect addressing must not be used.
    // the region rules are:
    // Source and Destination horizontal stride must be aligned to the execution datatype.
    // Example:
    // mov (4) r10.0:df r11.0<16;8,2>:f // Source stride must be 2 since datatype is smaller
    // mov (4) r10.0<2>:f r11.0<4;4,1>:df // Destination stride must be 2 since datatype is smaller.
    // as this would require splitting in some cases
    // Regioning must ensure Src.Vstride = Src.Width * Src.Hstride
    // Source and Destination offset must be the same, except the case of scalar source
    // [DevCHV, DevBXT]: When source or destination datatype is 64b, indirect addressing must not be used.
    // [DevCHV, DevBXT]: ARF registers must never be used with 64b datatype.

    if (!builder.no64bitRegioning())
    {
        return false;
    }

    G4_INST* inst = *iter;
    bool uses64BitType = false;
    bool isDWMultiply = false;
    uint8_t execSize = inst->getExecSize();

    if (inst->mayExceedTwoGRF())
    {
        return false;
    }
    // detect any 64-bit operand on dst or srcs
    if (inst->getDst() && inst->getDst()->getTypeSize() == 8)
    {
        uses64BitType = true;
    }
    for (int i = 0, size = inst->getNumSrc(); !uses64BitType && i < size; i++)
    {
        G4_Operand* src = inst->getSrc(i);

        if (src && src->getTypeSize() == 8)
        {
            uses64BitType = true;
        }
    }
    if (inst->opcode() == G4_mul && IS_DTYPE(inst->getSrc(0)->getType()) &&
        IS_DTYPE(inst->getSrc(1)->getType()))
    {
        //WA: dw*dw multiply is considered to use 64bit data type since the result is 64-bit
        uses64BitType = true;
        isDWMultiply = true;
    }

    if (uses64BitType)
    {
        if (builder.noInt64())
        {
            // handle i64 mov/add/cmp/sel
            // ToDo: move it to its own pass
            if (inst->opcode() == G4_mov && IS_INT(inst->getDst()->getType()) && IS_INT(inst->getSrc(0)->getType()))
            {
                if (emulate64bMov(iter, bb))
                    return true;
            }
        }

        int numSrc = inst->getNumSrc();

        // handle indirect sources first
        for (int i = 0; i < numSrc; ++i)
        {
            G4_Operand* src = inst->getSrc(i);
            if (src != nullptr && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                int byteSize = srcAsRegion->getTypeSize();
                if (byteSize == 8)
                {
                    // right bound is not available for indirect operands
                    // FIXME: this code should be moved to getRightBound()
                    int rightBound = 0;
                    // we must change move type to UD

                    if (region->isScalar())
                    {
                        rightBound = byteSize;
                    }
                    else if (region->isRegionWH())
                    {
                        rightBound = inst->getExecSize() * byteSize;
                    }
                    else
                    {
                        int num_rows = inst->getExecSize() / region->width;
                        rightBound = (num_rows - 1) * region->vertStride * byteSize +
                            region->horzStride * (region->width - 1) * byteSize +
                            byteSize;
                    }

                    // copy the indirect 64b source into a GRF-aligned temp,
                    // then read the temp directly
                    int numDwords = rightBound / TypeSize(Type_UD);
                    numDwords = Round_Up_Pow2(numDwords);
                    G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), GRFALIGN);
                    // new source's region varies depending on whether it's VxH or 1x1
                    const RegionDesc* newRegion = region->isRegionWH() ? builder.getRegionStride1() : region;
                    copyDwordsIndirect(tmpSrc, srcAsRegion, numDwords, bb, iter);
                    G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
                        Direct, tmpSrc->getRegVar(), 0, 0, newRegion, tmpSrc->getElemType());
                    inst->setSrc(tmpSrcOpnd, i);
                }
                else
                {
                    // use the good ol' insertMovBefore
                    G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
                    G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
                    tmpSrcDcl->setSubRegAlign(GRFALIGN);
                    inst->setSrc(tmpSrc, i);
                }
            }
        }

        // now handle direct sources with bad region/alignment
        bool hasSameOffset = hasSameSubregOffset(inst);
        for (int i = 0; i < numSrc; i++)
        {
            G4_Operand* src = inst->getSrc(i);
            if (src != NULL && src->isSrcRegRegion())
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                int byteSize = srcAsRegion->getTypeSize();

                if (!isDWMultiply && !region->isScalar() &&
                    (byteSize != 8 && (byteSize * region->horzStride) < 8))
                {
                    // source is not 8 byte aligned
                    // this can happen e.g. for
                    // mov (8) r1.0<1>:df (mod)r3<8;8,1>:f
                    // which we'd need to change to
                    // mov (8) r10.0<2>:f (mod)r3.0<8;8,1>:f
                    // mov (8) r1.0<1>:df r10.0<8;4,2>:f
                    // to satisfy rule 1
                    uint8_t exSize = inst->getExecSize();
                    uint16_t multFactor = (uint16_t)(8 / byteSize);
                    G4_Type tmpType = srcAsRegion->getType();
                    if (multFactor == 8)
                    {
                        // byte type needs special handling since we can't have stride 8
                        tmpType = (tmpType == Type_B) ? Type_W : Type_UW;
                        multFactor = 4;
                    }
                    MUST_BE_TRUE(multFactor != 8, "does not support 64b operation with byte source");
                    G4_Declare* tmp = builder.createTempVar(exSize * multFactor,
                        tmpType, GRFALIGN);
                    G4_DstRegRegion* tmpDst = builder.createDstRegRegion(tmp, multFactor);
                    G4_INST* movInst = builder.createMov(inst->getExecSize(), tmpDst, src, inst->getOption(), false);
                    bb->insertBefore(iter, movInst);
                    uint16_t width = exSize;
                    if (width * 8u > numEltPerGRF<Type_UB>())
                    {
                        // can't have width cross GRF
                        width = 4;
                    }
                    G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(tmp,
                        builder.createRegionDesc((uint16_t)multFactor * width, width, multFactor));
                    inst->setSrc(newSrc, i);
                }
                else if (region->isScalar())
                {
#if 0
                    // scalar region still must be aligned to qword, though it can be any qword
                    if (byteSize < 8 && !builder.isOpndAligned(srcAsRegion, 8))
                    {
                        G4_Operand* tmpSrc = insertCopyBefore(iter, i, Four_Word, bb);
                        inst->setSrc(tmpSrc, i);
                    }
#endif
                }
                else if (!hasSameOffset)
                {
                    // we need a temp src that is GRF-aligned
                    if (byteSize == 8)
                    {
                        // the same src/dst offset restriction applies to move as well, so we have to generate
                        // a packed move with UD type to work around the restriction
                        // e.g., for
                        // add (2) ... r1.1<4;2,2>:q
                        // we turn it into
                        // mov (8) r10.0<1>:ud r1.2<1;1,0>:ud {NoMask}
                        // add (2) ... r10.0<4;2,2>:q
                        int numDwords = (src->getRightBound() - src->getLeftBound() + 1) / TypeSize(Type_UD);
                        numDwords = Round_Up_Pow2(numDwords);
                        G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), GRFALIGN);
                        copyDwords(tmpSrc, 0, src->getTopDcl(), src->getLeftBound(), numDwords, bb, iter);
                        G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
                            Direct, tmpSrc->getRegVar(), 0, 0, srcAsRegion->getRegion(), tmpSrc->getElemType());
                        inst->setSrc(tmpSrcOpnd, i);
                    }
                    else
                    {
                        // use the good ol' insertMovBefore
                        G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
                        G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
                        tmpSrcDcl->setSubRegAlign(GRFALIGN);
                        inst->setSrc(tmpSrc, i);
                    }
                }
            }
        }

        for (int i = 0; i < numSrc; i++)
        {
            // rewrite <1;1,0> to <2;2,1> since HW does not like the former
            G4_Operand* src = inst->getSrc(i);
            if (src != nullptr && src->isSrcRegRegion())
            {
                G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
                const RegionDesc* region = srcAsRegion->getRegion();
                if (!region->isRegionWH() && region->vertStride != region->horzStride * region->width)
                {
                    // see if we can fix the region to satisfy VS = W * HS
                    if (region->width == inst->getExecSize())
                    {
                        // vs is a don't care, change to <w*hs, w, hz>
                        srcAsRegion->setRegion(builder.createRegionDesc(region->width * region->horzStride, region->width, region->horzStride));
                    }
                    else if (region->width == 1)
                    {
                        // hs is a don't care, change it to <esize*vs, esize, vs>
                        MUST_BE_TRUE(region->vertStride <= 4, "illegal vertical stride");

                        uint16_t wd = inst->getExecSize();
                        uint16_t hs = region->vertStride;
                        if (src->crossGRF())
                        {
                            // Make sure the new hs does not cross GRF
                            uint32_t nbytesIn1stGRF = numEltPerGRF<Type_UB>() - (src->getLeftBound() % numEltPerGRF<Type_UB>());
                            uint32_t eltBytes = srcAsRegion->getTypeSize();
                            uint32_t neltsIn1stGRF = nbytesIn1stGRF / eltBytes;

                            MUST_BE_TRUE((nbytesIn1stGRF % eltBytes) == 0, "Bad region with element crossing GRF");
                            MUST_BE_TRUE((neltsIn1stGRF % hs) == 0, "hs cannot cross GRF");

                            wd = neltsIn1stGRF / hs;
                            // Get the largest powOfTwo that can divide wd
                            // (wd & -wd isolates the lowest set bit)
                            wd = wd & (-wd);
                            //MUST_BE_TRUE(wd > 1, "Cannot select non-1 width w/o crossing GRF");
                        }
                        srcAsRegion->setRegion(builder.createRegionDesc(wd * hs, wd, hs));
                    }

                    else
                    {
                        // FIXME: Both VS and HS are used by the region, so we have to either split inst or insert multiple moves to pack the source
                        // both are painful, so we assert for now and fix later if we encounter such a case
                        MUST_BE_TRUE(false, "Unhandled bad 64b region on CHV/BXT");
                    }

                }
            }
        }
        // finally, fix the destination if it is indirect, mis-offset, or an ARF
        G4_DstRegRegion* dst = inst->getDst();
        if (dst != NULL && !dst->isNullReg())
        {
            bool needsTmpDst = dst->getRegAccess() != Direct ||
                (execSize > 1 && !hasSameOffset) ||
                dst->isAreg();
            if (needsTmpDst)
            {
                // we need to have a temp dst that is direct and GRF-aligned
                if (dst->getRegAccess() == Direct && dst->getTypeSize() == 8)
                {
                    // the same src/dst offset restriction applies to move as well, so we have to generate
                    // a move with UD type to work around the restriction
                    // e.g., for
                    // add (2) r1.2<1>:q ...
                    // we generate
                    // add (2) r3.0<1>:q ...
                    // mov (4) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
                    // If dst is not contiguous, we additionally add a move to pre-load the old values:
                    // add (2) r1.2<2>:q ...
                    // becomes
                    // mov (8) r3.0<1>:ud r1.4<1;1,0>:ud {NoMask}
                    // add (2) r3.0<2>:q ...
                    // mov (8) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
                    int numDwords = (dst->getRightBound() - dst->getLeftBound() + 1) / TypeSize(Type_UD);
                    numDwords = Round_Up_Pow2(numDwords);
                    G4_Declare* tmpDst = builder.createTempVar(numDwords / 2, dst->getType(), GRFALIGN);
                    if (numDwords > execSize * 2)
                    {
                        // dst is not packed, need a move to pre-load the dst value into tmp
                        copyDwords(tmpDst, 0, dst->getTopDcl(), dst->getLeftBound(), numDwords, bb, iter);
                    }
                    INST_LIST_ITER next = iter;
                    ++next;
                    copyDwords(dst->getTopDcl(), dst->getLeftBound(), tmpDst, 0, numDwords, bb, next);
                    inst->setDest(builder.createDstRegRegion(tmpDst, dst->getHorzStride()));
                }
                else
                {
                    // use the good ol' insertMoveAfter
                    G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
                    G4_Declare* tmpDstDcl = tmpDst->getTopDcl();
                    tmpDstDcl->setSubRegAlign(GRFALIGN);
                    inst->setDest(tmpDst);
                    if (dst->getTypeSize() == 8)
                    {
                        // tmpDst is indirect and thus still does not conform
                        // we rewrite
                        // mov (e) r[a0.0]<1>:q src<1;1,0>:q
                        // into
                        // mov (e*2) r[a0.0]<1>:ud src<1;1,0>:ud {NoMask}
                        ++iter;
                        G4_INST* movInst = *iter;
                        MUST_BE_TRUE(movInst->opcode() == G4_mov && movInst->getDst() == dst &&
                            movInst->getSrc(0)->isSrcRegRegion(),
                            "unexpected instruction created by insertMovAfter");
                        MUST_BE_TRUE(dst->getHorzStride() == 1, "only stride 1 is supported for now");
                        dst->setType(Type_UD);
                        G4_SrcRegRegion* src = movInst->getSrc(0)->asSrcRegRegion();
                        G4_Declare* tmpAsUD = builder.createTempVar(tmpDstDcl->getNumElems() * 2, Type_UD, Any);
                        tmpAsUD->setAliasDeclare(tmpDstDcl, 0);
                        const RegionDesc* newRegion = src->getRegion()->isScalar() ?
                            builder.createRegionDesc(0, 2, 1) : builder.getRegionStride1();
                        G4_SrcRegRegion* srcAsUD = builder.createSrcRegRegion(src->getModifier(),
                            src->getRegAccess(), tmpAsUD->getRegVar(), src->getRegOff(),
                            src->getSubRegOff() * 2, newRegion, tmpAsUD->getElemType());
                        movInst->setSrc(srcAsUD, 0);
                        movInst->setExecSize(G4_ExecSize(inst->getExecSize() * 2u));

                        // NoMask is set on the mov instruction, but if we fall outside of the new execution size,
                        // it won't be executed fully
                        // e.g., we have to change
                        // (W) mov (16|M24) r[a0.0,64]<1>:ud r67.0<8;8,1>:ud
                        // into
                        // (W) mov (16|M0) r[a0.0,64]<1>:ud r67.0<8;8,1>:ud
                        movInst->setMaskOption(InstOpt_M0);


                        // mov saturate/pred to the original inst
                        movInst->setOptionOn(InstOpt_WriteEnable);
                        if (movInst->getSaturate())
                        {
                            movInst->setSaturate(g4::NOSAT);
                            inst->setSaturate(g4::SAT);
                        }
                        G4_Predicate* pred = movInst->getPredicate();
                        if (pred)
                        {
                            MUST_BE_TRUE(inst->getPredicate() == nullptr, "both inst and movInst have predicates");
                            movInst->setPredicate(nullptr);
                            inst->setPredicate(pred);
                        }
                    }
                }
            }
        }
    }
    return false;
}
3657
3658 //------------------------------------------------------------------------------
3659 //
3660 // For BDW, 32 bits integer multiply is implemented as the following macro
3661 //
3662 // mul (8) acc0:d r2.0<8;8,1>d r3.0<16;8,2>:uw
3663 // mach (8) rTemp<1>:d r2.0<8;8,1>d r3.0<8;8,1>:d
3664 // mov (8) r5.0<1>:d rTemp:d // hi-32bits
3665 // mov (8) r6.0<1>:d acc0:d // lo-32bits
3666 //
3667 // Note that this only changes the mul instruction's src1, mach and mov is generated elsewhere
3668 //------------------------------------------------------------------------------
fixMulSrc1(INST_LIST_ITER i,G4_BB * bb)3669 void HWConformity::fixMulSrc1(INST_LIST_ITER i, G4_BB* bb)
3670 {
3671 G4_INST* inst = *i;
3672 G4_Operand* src1 = inst->getSrc(1);
3673
3674 if (!IS_DTYPE(src1->getType()))
3675 {
3676 // this could happen if dst is Q
3677 return;
3678 }
3679
3680 if (src1->isImm())
3681 {
3682 uint64_t truncVal = src1->asImm()->getImm() & 0xFFFF;
3683 G4_Imm* new_src1 = builder.createImm(truncVal, Type_UW);
3684 inst->setSrc(new_src1, 1);
3685 }
3686 else
3687 {
3688 assert(src1->isSrcRegRegion() && "region expected");
3689 G4_SrcRegRegion* srcRegion = src1->asSrcRegRegion();
3690 const RegionDesc* rd = srcRegion->getRegion();
3691
3692 // create a new opnd with type UW
3693 unsigned short scale = TypeSize(Type_D) / TypeSize(Type_UW);
3694 unsigned short newHS = rd->horzStride * scale;
3695 unsigned short newVS = rd->vertStride * scale;
3696 const RegionDesc* new_rd = builder.createRegionDesc(newVS, rd->width, newHS);
3697 short subRegOff = srcRegion->getSubRegOff();
3698 if (srcRegion->getRegAccess() == Direct)
3699 {
3700 subRegOff *= scale;
3701 }
3702 auto new_src1 = builder.createSrcRegRegion(
3703 srcRegion->getModifier(), srcRegion->getRegAccess(),
3704 srcRegion->getBase(), srcRegion->getRegOff(), subRegOff, new_rd,
3705 Type_UW);
3706 inst->setSrc(new_src1, 1);
3707 if (srcRegion->getRegAccess() != Direct)
3708 {
3709 new_src1->setImmAddrOff(srcRegion->getAddrImm());
3710 }
3711 }
3712 }
3713
3714 /*
3715 * only acc0 may be used in DWord operations, so we have to break a
3716 * SIMD16 DWord multiply into two mul-mach-mov sequences.
3717 *
3718 * Input:
3719 * (f0) mul (16) dst:d src0:d src1:d
3720 *
3721 * Output:
3722 * mul (8) acc0:d src0:d src1:d
3723 * mach (8) null:d src0:d src1:d
3724 * (f0) mov (8) dst:d acc0:d
3725 * mul (8) acc0:d src0+1:d src1+1:d
3726 * mach (8) null:d src0+1:d src1+1:d
3727 * (f1) mov (8) dst+1:d acc0:d
3728 *
3729 */
// Split every SIMD16 instruction in [start, end] into two SIMD8 halves,
// since only acc0 may be used in DWord operations (see comment above).
// <start> and <end> are updated in place to keep bracketing the expanded
// sequence; <last_iter> marks the insertion point after the range so that
// all second halves land after all first halves.
void HWConformity::splitDWMULInst(INST_LIST_ITER& start, INST_LIST_ITER& end, G4_BB* bb)
{
    // split simd16 inst into SIMD8 ones, since D is not supported for acc1
    INST_LIST_ITER iter = start, last_iter = end;
    //iter--;
    last_iter++;
    INST_LIST_ITER curr_iter;
    while (iter != end)
    {
        curr_iter = iter;
        evenlySplitInst(curr_iter, bb);
        // curr_iter points to the second half after instruction splitting
        G4_INST* expand_sec_half_op = *curr_iter;
        iter++;

        // move the second half to the tail of the range
        bb->insertBefore(last_iter, expand_sec_half_op);
        if (curr_iter == start)
        {
            // first half of the first inst is now one before curr_iter;
            // retarget start before erasing curr_iter invalidates it
            start--;
        }
        bb->erase(curr_iter);
    }
    // handle the last inst
    if (iter == end)
    {
        evenlySplitInst(iter, bb);
        G4_INST* expand_sec_half_op = *iter;
        bb->insertBefore(last_iter, expand_sec_half_op);
        // For the case that only one instruction needed to split, that is to say start equals to end
        if (start == end)
        {
            start--;
        }
        // end must track the last inserted/remaining instruction
        end--;
        bb->erase(iter);
    }
}
3767
isGoodMadType(G4_Type type)3768 static bool isGoodMadType(G4_Type type)
3769 {
3770 switch (type)
3771 {
3772 case Type_F:
3773 case Type_HF:
3774 case Type_DF:
3775 case Type_BF:
3776 return true;
3777 default:
3778 return false;
3779 }
3780 }
3781
bool HWConformity::isGoodAlign1TernaryDst(G4_INST* inst) const
{
    // Align1 MAD requirements:
    // -- dst must be direct GRF/acc with horizontal stride 1 or 2
    G4_Type execType = inst->getExecType();
    G4_DstRegRegion* dst = inst->getDst();

    MUST_BE_TRUE(!IS_QTYPE(dst->getType()) && !IS_BTYPE(dst->getType()), "3Src inst don't support Q and B dst types");

    // Without mix-mode support, a low-precision float dst combined with a
    // wider execution type cannot be encoded.
    if (!builder.hasMixMode() &&
        isLowPrecisionFloatTy(dst->getType()) && !isLowPrecisionFloatTy(execType))
    {
        return false;
    }

    auto dstTySize = dst->getTypeSize();

    // required dst alignment: the stricter of the type size and the
    // platform's 3-src dst alignment
    int alignInBytes = std::max((int) dstTySize, builder.get3SrcDstAlign());

    if (builder.noSrc2Regioning())
    {
        // src2 is required to have the same subreg as dst if src2 is not a scalar
        // If we can't guarantee this we have to align both of them to GRF
        unsigned src2Pos = inst->opcode() == G4_pseudo_mad ? 0 : 2;
        auto src2 = inst->getSrc(src2Pos);
        if (src2->isSrcRegRegion() && !src2->asSrcRegRegion()->isScalar())
        {
            alignInBytes = getGRFSize();
        }
    }

    if (!builder.isOpndAligned(dst, alignInBytes))
    {
        // dst may have special alignment due to encoding issues
        return false;
    }

    uint32_t effectiveStride = dst->getHorzStride();
    if (dstTySize < TypeSize(execType))
    {
        if (IS_TYPE_INT(dst->getType()))
        {
            // narrower int dst: scale the stride so each element occupies a
            // full exec-type-sized slot
            effectiveStride *= TypeSize(execType) / dstTySize;
        }
        else
        {
            // we have mixed HF and F inst
            // dst can be packed HF, but then it must be oword aligned
            // this should be checked later for mixed mode inst
        }
    }

    return dst->getRegAccess() == Direct && effectiveStride <= 2;
}
3836
3837 //
3838 // check for legal align1 ternary inst sources
3839 //
// Check whether source srcPos of inst is legal for an align1 ternary
// (mad-style) instruction. canBeImm indicates whether an immediate is still
// permitted in this slot (at most one 16-bit immediate is allowed).
bool HWConformity::isGoodAlign1TernarySrc(G4_INST* inst, int srcPos, bool canBeImm)
{
    MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");

    uint8_t execSize = inst->getExecSize();
    G4_Operand* src = inst->getSrc(srcPos);
    // for pseudo_mad we have to swap src0 and src2
    bool isSrc2 = inst->opcode() == G4_pseudo_mad ? srcPos == 0 : srcPos == 2;

    if (!builder.hasMixMode())
    {
        // no mix mode: a low-precision float src with a wider exec type is illegal
        G4_Type execType = inst->getExecType();
        if (isLowPrecisionFloatTy(src->getType()) && !isLowPrecisionFloatTy(execType))
        {
            return false;
        }
    }

    if (IS_QTYPE(src->getType()))
    {
        return false;
    }

    // mad specific checks
    if (inst->opcode() == G4_pseudo_mad)
    {
        if (isSrc2)
        {
            // DW src2 is not supported by mad
            if (IS_DTYPE(src->getType()))
            {
                return false;
            }

            if (builder.noSrc2Regioning() && IS_BTYPE(src->getType()))
            {
                return false;
            }
        }
        else if (srcPos == 1)
        {
            if (IS_DTYPE(src->getType()) && src->isSrcRegRegion() &&
                src->asSrcRegRegion()->getModifier() != Mod_src_undef)
            {
                // no source modifier for DW multiply
                return false;
            }
        }
    }

    if (src->isImm())
    {
        // either src0 or src2 can be 16b imm, but not both
        // permanent WA: simd16 inst can't have src0 imm.
        // Instead of splitting, we just add a move

        if (canBeImm && (srcPos == 0 || srcPos == 2) && src->getTypeSize() <= 2)
        {
            if (VISA_WA_CHECK(builder.getPWaTable(), WaNoSimd16TernarySrc0Imm))
            {
                return !isSrc2 && inst->getExecSize() != g4::SIMD16;
            }
            return true;
        }
        return false;
    }
    else if (src->isSrcRegRegion())
    {
        // only direct access is supported for align1 ternary sources
        if (src->asSrcRegRegion()->getRegAccess() != Direct)
        {
            return false;
        }

        // Validates (and may normalize) a region that collapses to a single
        // stride over the whole execution size.
        auto checkSingleStrideRegion = [](G4_SrcRegRegion* src, int stride, uint8_t execSize, IR_Builder& builder)
        {
            const RegionDesc* srcRegion = src->getRegion();

            if (stride > 4)
            {
                return false;
            }
            else if (srcRegion->isContiguous(execSize))
            {
                // Normalize the region if it is not.
                if (srcRegion->width != 1)
                {
                    src->setRegion(builder.getRegionStride1(), /*invariant*/ true);
                }
                if (!builder.encodeUnitStrideTernary())
                {
                    // we have to make sure width is not being used to cross GRF, as <1;1,0>
                    // is not a legal region for align1 ternary source (vs 1 not supported)
                    // mad doesn't support <1;1,0>, the width is at least 2
                    int minAlignment = src->getTypeSize() * 2;
                    return builder.isOpndAligned(src, minAlignment);
                }
            }
            return true;
        };

        // the following regions are supported:
        // <N;N,0>
        // <0;1,0>
        // <W*H;W,H>
        const RegionDesc* srcRegion = src->asSrcRegRegion()->getRegion();
        if (srcRegion->isScalar())
        {
            return true;
        }

        // src0 and src1 (for pseudo-mad, it's src1 and src2) may use the <N;N,0> region
        // as they come with a vStride in encoding
        // TODO: we may consider swapping src1 and src2 to catch more regions
        if (!isSrc2)
        {
            uint16_t stride = 0;
            if (srcRegion->isSingleStride(execSize, stride))
            {
                return checkSingleStrideRegion(src->asSrcRegRegion(), stride, execSize, builder);
            }

            if (builder.encodeUnitStrideTernary())
            {
                // <4;4,0> and <8;8,0> are ok
                return srcRegion->vertStride == srcRegion->width &&
                    srcRegion->horzStride == 0 &&
                    (srcRegion->width == 4 || srcRegion->width == 8);
            }
            else
            {
                // <2;2,0>, <4;4,0> and <8;8,0> are ok
                return srcRegion->vertStride == srcRegion->width &&
                    srcRegion->horzStride == 0 &&
                    srcRegion->width <= 8;
            }
        }
        else
        {
            if (!builder.noSrc2Regioning())
            {
                // src2 (src0 for pseudo-mad) is without vstride, and its region must be
                // <esize*H;esize,H>, with vstride derived from exSize and hstride
                uint16_t stride = 0;
                if (srcRegion->isSingleStride(execSize, stride))
                {
                    return checkSingleStrideRegion(src->asSrcRegRegion(), stride, execSize, builder);
                }
            }
            else
            {
                // not a scalar, src2 must be GRF aligned.
                if (!builder.isOpndAligned(src, numEltPerGRF<Type_UB>()))
                {
                    return false;
                }

                uint16_t stride = 0;
                if (srcRegion->isSingleStride(execSize, stride))
                {
                    unsigned short dstExecSize = inst->getDst()->getExecTypeSize();
                    unsigned short srcExecSize = stride * src->asSrcRegRegion()->getElemSize();
                    // Source 2 and destination stride must be aligned to the same execution type.
                    // E.g. mad (4) r10.0<1>:hf src0 src1 r13.0<1>:hf
                    //      mad (4) r10.0<2>:hf src0 src1 r13.0<1>:f
                    //      mad (4) r10.0<1>:f src0 src1 r13.0<2>:hf
                    // this rule is relaxed if mix mode is enabled (packed HF ok)
                    if (dstExecSize == srcExecSize)
                    {
                        return true;
                    }
                    if (builder.hasPartialMixMode() && inst->isMixedMode())
                    {
                        return true;
                    }
                }
            }

            return false;
        }
    }

    return true;
}
4022
4023 //
4024 // a source is good for align16 if:
4025 // -- it is a direct srcRegRegion
4026 // -- it has contiguous region and can be made either GRF-aligned (for exec size >= 8)
4027 // or oword aligned (for exec size == 4)
4028 // -- or it has scalar region and is not non-simd1 double
isGoodAlign16Src(G4_INST * inst,int srcPos)4029 bool HWConformity::isGoodAlign16Src(G4_INST* inst, int srcPos)
4030 {
4031 MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");
4032
4033 uint8_t execSize = inst->getExecSize();
4034 G4_Operand* src = inst->getSrc(srcPos);
4035 G4_Type opnd_type = src->getType();
4036
4037 // Constants are not allowed as MAD opnds.
4038 if (src->isSrcRegRegion())
4039 {
4040 const RegionDesc* region = src->asSrcRegRegion()->getRegion();
4041 G4_RegAccess regAcc = src->asSrcRegRegion()->getRegAccess();
4042
4043 if (regAcc != Direct)
4044 {
4045 return false;
4046 }
4047
4048 if (region->isContiguous(execSize))
4049 {
4050 if (builder.getPlatform() == GENX_BDW && TypeSize(opnd_type) < 4)
4051 {
4052 // BDW HF has to be 32-byte aligned
4053 if (!builder.isOpndAligned(src, 32))
4054 {
4055 return false;
4056 }
4057 }
4058 else
4059 {
4060 if (execSize >= 8)
4061 {
4062 // operand must be GRF aligned, or oword aligned for HF/W
4063 uint32_t align = std::min<uint32_t>(execSize * src->getTypeSize(), 32);
4064 if (!builder.isOpndAligned(src, align))
4065 {
4066 return false;
4067 }
4068 }
4069 else if (execSize == 4 || execSize == 2)
4070 {
4071 // operand must be oword-aligned
4072 if (!builder.isOpndAligned(src, 16))
4073 {
4074 return false;
4075 }
4076 }
4077 }
4078 }
4079 else if (src->asSrcRegRegion()->isScalar())
4080 {
4081 if (opnd_type == Type_DF && execSize != 1)
4082 {
4083 // scalar region is illegal for DF since replicate is not supported
4084 return false;
4085 }
4086
4087 if (opnd_type == Type_HF && builder.getPlatform() == GENX_BDW)
4088 {
4089 return false;
4090 }
4091 }
4092 else
4093 {
4094 // all other regions are illegal
4095 return false;
4096 }
4097
4098 return true;
4099 }
4100 else
4101 {
4102 return false;
4103 }
4104
4105 }
4106
4107 //
4108 // Move modifiers of src2 in pseudo_mad to its defining instruction.
4109 //
// mul (16) V66(0,0)<1>:d V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
// pseudo_mad (16) V67(0,0)<1>:d V46(9,0)<8;8,1>:w 0x1b5d:w -V66(0,0)<16;16,1>:d
//
// becomes
//
// mul (16) V66(0,0)<1>:d -V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
// pseudo_mad (16) V67(0,0)<1>:d V46(9,0)<8;8,1>:w 0x1b5d:w V66(0,0)<16;16,1>:d
4117 //
// Fold the Mod_Minus modifier on `src` (a pseudo_mad src2) into `def`, the
// single instruction defining it, so the modifier can be removed from the
// mad. Only safe when def has exactly one use, types are signed int and
// match, and def is a mul whose src1 can absorb the negation.
static void tryTransferSrcModifier(IR_Builder& builder, G4_INST* def,
    G4_Operand* src)
{
    // Only when def has no other users.
    if (!def->hasOneUse())
        return;

    // Only transfer for integer types.
    if (!IS_SIGNED_INT(src->getType()))
        return;

    // In case the use type is different from the def type.
    if (!def->getDst() || (def->getDst()->getType() != src->getType()))
        return;

    switch (def->opcode()) {
    default:
        break;

        // Probably this is the only interesting op, since G4_math will not be
        // used to generate mac.
    case G4_mul:
    {
        // Chances are src1 is an immediate.
        G4_Operand* defSrc1 = def->getSrc(1);
        if (!IS_SIGNED_INT(defSrc1->getType()))
            return;

        if (defSrc1->isImm())
        {
            G4_Imm* val = defSrc1->asImm();
            // Mod_Minus is assumed: negate the immediate and clear the
            // modifier on the mad source.
            G4_Imm* newVal = builder.createImm(-val->getInt(), val->getType());
            def->setSrc(newVal, 1);
            src->asSrcRegRegion()->setModifier(Mod_src_undef);
        }
        else if (defSrc1->isSrcRegRegion())
        {
            G4_SrcRegRegion* reg = defSrc1->asSrcRegRegion();
            if (reg->getModifier() == Mod_src_undef)
            {
                // move the minus onto the mul's src1
                reg->setModifier(src->asSrcRegRegion()->getModifier());
                src->asSrcRegRegion()->setModifier(Mod_src_undef);
            }
            else if (reg->getModifier() == Mod_Minus)
            {
                // -(-x) == x: the two minus modifiers cancel
                reg->setModifier(Mod_src_undef);
                src->asSrcRegRegion()->setModifier(Mod_src_undef);
            }
        }
    } break;
    }
}
4171
// Try to move source modifiers on MAD's src2 into its definition. This allows
4173 // pseudo_mad ops to be translated into mac ops.
tryEliminateMadSrcModifier(IR_Builder & builder,G4_INST * inst)4174 void HWConformity::tryEliminateMadSrcModifier(IR_Builder& builder, G4_INST* inst)
4175 {
4176 ASSERT_USER(inst->opcode() == G4_pseudo_mad, "not a speudo-mad");
4177
4178 // For pseudo_mad, src2 is the major source operand to be examined later.
4179 // If there is no modifier on src2, then nothing to do.
4180 G4_Operand* src2 = inst->getSrc(2);
4181 if (!src2->isSrcRegRegion())
4182 return;
4183
4184 // Currently, only handle modifier minus. To handle others, we may need
4185 // to insert extra instructions.
4186 if (src2->asSrcRegRegion()->getModifier() != Mod_Minus)
4187 return;
4188
4189 // Only when src2 has a single definition.
4190 if (G4_INST* def = inst->getSingleDef(Opnd_src2, true))
4191 {
4192 tryTransferSrcModifier(builder, def, src2);
4193 }
4194 }
4195
4196 /// Heuristic to decide whether this fp pseudo-mad should be lowered into a
4197 /// GEN mad or not. Returns true if mad is preferred, false otherwise.
4198 ///
4199 /// We flavor generating non-mad when this vISA mad is part of b2b mads that
4200 /// share the same dst.
4201 ///
isFpMadPreferred(G4_BB * bb,INST_LIST_ITER iter)4202 bool HWConformity::isFpMadPreferred(G4_BB* bb, INST_LIST_ITER iter)
4203 {
4204 G4_INST* inst = *iter;
4205 G4_Operand* dst = inst->getDst();
4206 MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
4207
4208 // Check whether test_inst is sharing the same dst.
4209 auto equal_mad_dst = [](G4_INST* test_inst, G4_Operand* dst)
4210 {
4211 if (test_inst->opcode() == G4_pseudo_mad)
4212 {
4213 G4_Operand* test_dst = test_inst->getDst();
4214 if (test_dst->compareOperand(dst) == Rel_eq)
4215 return true;
4216 }
4217 return false;
4218 };
4219
4220 auto next_iter = std::next(iter);
4221 if (next_iter != bb->end())
4222 {
4223 G4_INST* next_inst = *next_iter;
4224 if (equal_mad_dst(next_inst, dst))
4225 return false;
4226 }
4227 if (iter != bb->begin())
4228 {
4229 auto prev_iter = std::prev(iter);
4230 G4_INST* prev_inst = *prev_iter;
4231 if (equal_mad_dst(prev_inst, dst))
4232 return false;
4233 }
4234
4235 // FIXME: remove possile duplicate calls to isGoodAlign16Src, Cm only.
4236 // This will go away if we use an extra opcode to represent muladd.
4237 unsigned extraMov = 0;
4238 for (int k = 0; k < inst->getNumSrc(); k++)
4239 {
4240 if (!isGoodAlign16Src(inst, k))
4241 {
4242 // If need to insert >1 number of moves, then do not use mad.
4243 if (++extraMov > 1)
4244 return false;
4245 }
4246 }
4247
4248 return true;
4249 }
4250
4251 // generate align1 mad, inserting moves if necessary
4252 // returns true if conversion is successful
4253 // for floating point mad this must succeed due to precision requirements
bool HWConformity::generateAlign1Mad(G4_BB* bb, INST_LIST_ITER iter)
{

    G4_INST* inst = *iter;
    MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
    // FP dst: mad is required for precision, so fix-up moves are forced
    // rather than bailing out.
    bool mustDoMad = IS_TYPE_FLOAT_ALL(inst->getDst()->getType());


    // try swapping src0 (really src2) and src1 to see if we can save a move
    // some conditions where swap may help:
    // -- if src0 is D, as MAD only supports D + D * W
    // -- if src1 is imm, as MAD src2 supports 16-bit imm
    // -- if src0 is HF in a mix mode MAD, as MAD src1 supports HF
    // -- if src1 is scalar, as MAD src2 has more region restrictions
    // We perform the swapping before the dst checks as some platforms require dst and src2 to have the same subreg
    {
        G4_Operand* src0 = inst->getSrc(0);
        G4_Operand* src1 = inst->getSrc(1);
        if (IS_DTYPE(src0->getType()) && src0->isSrcRegRegion() && !IS_DTYPE(src1->getType()))
        {
            inst->swapSrc(0, 1);
        }
        else if (src1->isImm() && src1->getTypeSize() == 2)
        {
            //swap src0 and src1 as src0 supports imm
            inst->swapSrc(0, 1);
        }
        else if (src0->isSrcRegRegion() && !src0->asSrcRegRegion()->isScalar() &&
            src1->isSrcRegRegion() &&
            src1->asSrcRegRegion()->isScalar())
        {
            // Swap src0 and src1 if src1 is scalar but src0 is not, as src2 regioning support is quite limited.
            inst->swapSrc(0, 1);
        }
        else if (isLowPrecisionFloatTy(src0->getType()) && src1->getType() == Type_F)
        {
            inst->swapSrc(0, 1);
        }
    }

    if (!isGoodAlign1TernaryDst(inst))
    {
        if (mustDoMad)
        {
            // fix the dst with a move so the mad stays legal
            auto alignment = builder.noSrc2Regioning() ? GRFALIGN : Four_Word;
            inst->setDest(insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb, alignment));
        }
        else
        {
            return false;
        }
    }

    // check src
    bool canBeImm = true; // at most one immediate source is allowed
    for (int k = inst->getNumSrc() - 1; k >= 0; k--)
    {
        G4_Operand* src = inst->getSrc(k);
        if (!isGoodAlign1TernarySrc(inst, k, canBeImm))
        {
            if (mustDoMad)
            {
                bool isSrc2 = (k == 0);
                if (builder.noSrc2Regioning() && isSrc2)
                {
                    fixSrc2(iter, bb, true);
                }
                else
                {
                    inst->setSrc(insertMovBefore(iter, k, src->getType(), bb), k);
                }
            }
            else
            {
                // Promote src2 from :b to :w to allow mad, for example:
                // pseudo_mad (16) V211(0,0)<1>:d V210(0,0)<1;0>:d V106(0,0)<0;0>:b V81(0,0)<1;0>:d
                // =>
                // mov (1) TV74(0,0)<1>:w V106(0,0)<0;1,0>:b {Q1, Align1, NoMask}
                // mad (16) V211(0,0)<1>:d V81(0,0)<1;0>:d V210(0,0)<1;0>:d TV74(0,0)<0;0>:w {H1, Align1}
                // Do not allow mad if both src1 and src2 are :b as it will generate mov+mov+mad. There is no benefit for
                // instruction count as mov+mov+mad equals to mov+mul+add. In some spilled cases the performance may be
                // even worse as more spill codes inserted.
                bool isSrc2 = (k == 0);
                if (builder.noSrc2Regioning() && isSrc2 && IS_BTYPE(src->getType()) && !IS_BTYPE(inst->getSrc(1)->getType()))
                {
                    bool hasModMinus = false;
                    if (src->isSrcRegRegion())
                    {
                        G4_SrcModifier mod = src->asSrcRegRegion()->getModifier();
                        hasModMinus = (mod == Mod_Minus || mod == Mod_Minus_Abs);
                    }

                    // If minus modifier is present, need signed type.
                    G4_Type type = (IS_SIGNED_INT(src->getType()) || hasModMinus) ? Type_W : Type_UW;
                    auto dstStrideInBytes = inst->getDst()->getHorzStride() * TypeSize(inst->getDst()->getType());
                    uint16_t stride = (uint16_t)(dstStrideInBytes / TypeSize(type));
                    inst->setSrc(insertMovBefore(iter, k, type, bb, stride, GRFALIGN), k);
                }
                else
                {
                    return false;
                }
            }
        }
        else
        {
            if (src->isImm())
            {
                // one immediate already used up; no further imm sources allowed
                canBeImm = false;
            }
        }
    }

    inst->setOpcode(G4_mad);
    //swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
    inst->swapSrc(0, 2);

    return true;
}
4373
4374 // convert a FP (HF/F/DF) pseudo-mad into a GEN mad,
4375 // inserting moves if necessary
4376 // returns true if conversion is successful
4377 // note that this must return true for IGC due to precision requirements
bool HWConformity::generateFPMad(G4_BB* bb, INST_LIST_ITER iter)
{
    G4_INST* inst = *iter;
    MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
    uint8_t execSize = inst->getExecSize();
    G4_DstRegRegion* dst = inst->getDst();

    // Align16 MAD requirements:
    // -- dst and all 3 srcs have the same F/HF/DF type (mixed F/HF is allowed on CHV+)
    // -- dst and all 3 srcs have direct access
    // -- execution size is 16/8/4/1
    // -- dst and src must be packed
    // -- if src region is not scalar, its subregister must be 16 byte aligned

    // do not force fma for CM since it doesn't have precision requirements
    bool preferFpMad = builder.getOption(vISA_forceFPMAD) || builder.favorFpMad();
    if (!preferFpMad)
    {
        preferFpMad = isFpMadPreferred(bb, iter);
    }

    // dst alignment requirement, clamped to [16, 32] bytes
    auto alignMent = execSize * dst->getTypeSize();
    alignMent = (alignMent > 32) ? 32 : alignMent;
    alignMent = (alignMent < 16) ? 16 : alignMent;

    if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
        !builder.isOpndAligned(dst, alignMent))
    {
        if (preferFpMad)
        {
            // fix the dst with a move into a conforming temp
            G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
            inst->setDest(tmpDst);
        }
        else
        {
            return false;
        }
    }

    // check src
    for (int k = 0; k < inst->getNumSrc(); k++)
    {
        G4_Type type = inst->getSrc(k)->getType();
        bool goodSrc = isGoodAlign16Src(inst, k);
        if (!goodSrc && preferFpMad)
        {
            // insert moves if type is legal mad type
            if (isGoodMadType(type))
            {
                G4_Operand* src = inst->getSrc(k);
                bool isReplicated = (type == Type_DF) &&
                    src->isSrcRegRegion() &&
                    (src->asSrcRegRegion()->getRegion()->width == 2) &&
                    (src->asSrcRegRegion()->getRegion()->horzStride == 0) &&
                    (src->asSrcRegRegion()->getRegion()->vertStride == 2);
                if ((type == Type_DF ||
                    (type == Type_HF && builder.getPlatform() == GENX_BDW)) &&
                    execSize > 1 &&
                    (src->isImm() || src->asSrcRegRegion()->isScalar()))
                {
                    // MAD DF does not support .r, so we have to broadcast the value
                    // '.r' on MAD HF on BDW is not a replication of that
                    // scalar element but a pair of half.
                    auto align = type == Type_HF ? GRFALIGN : Eight_Word;
                    broadcast(bb, iter, k, align);
                }
                // No need to insert mov for replicated DF src with <2;2,0> region,
                // which can be encoded as "xyxy" or "zwzw" swizzle based on offset
                else if (!isReplicated)
                {
                    inst->setSrc(insertMovBefore(iter, k, type, bb), k);
                }
                goodSrc = true;
            }
        }
        if (!goodSrc)
        {
            return false;
        }
    }

    inst->setOpcode(G4_mad);

    //swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
    inst->swapSrc(0, 2);

    return true;
}
4466
4467 // If the LF MAD does not conform to Genx ISA semantics, then translate
4468 // it into a valid GenX sequence - either an equivalent MUL/ADD sequence
4469 // or an equivalent MAC.
4470 // ASSUMPTION:
4471 // This phase must be called at the end of all other optimizations
4472 // phases and just prior to testing for ACC spilling.
void HWConformity::fixMADInst(G4_BB* bb)
{
    // For every pseudo_mad in the BB: try to lower it to a real GEN mad
    // (align1 or align16 flavor depending on platform support); if that
    // fails, fall back to an equivalent mul/add pair.
    bool doAlign1Mad = builder.hasAlign1Ternary();
    bb->resetLocalIds();
    INST_LIST_ITER i = bb->begin();

    for (auto iterEnd = bb->end(); i != iterEnd; ++i)
    {
        G4_INST* inst = *i;
        if (inst->opcode() != G4_pseudo_mad)
        {
            continue;
        }

        // fold a minus modifier on src2 into its def where possible
        tryEliminateMadSrcModifier(builder, inst);

        G4_DstRegRegion* dst = inst->getDst();
        uint32_t exec_size = inst->getExecSize();

        bool conforming_genx_mad = true;

        // mad can run at most at twice the native execution size
        if (exec_size > G4_ExecSize(builder.getNativeExecSize() * 2))
        {
            conforming_genx_mad = false;
        }
        else
        {
            switch (dst->getType())
            {
            case Type_F:
            case Type_HF:
            case Type_DF:
            case Type_BF:
                break;
            case Type_W:
            case Type_UW:
            case Type_D:
            case Type_UD:
                // integer mad requires align1 ternary support
                if (!doAlign1Mad)
                {
                    conforming_genx_mad = false;
                }
                break;
            default:
                conforming_genx_mad = false;
            }
        }

        if (conforming_genx_mad)
        {
            bool doMad = doAlign1Mad ?
                generateAlign1Mad(bb, i) : generateFPMad(bb, i);
            if (doMad)
            {
                // done with this pseudo-mad
                continue;
            }
        }

        // translate MAD into MUL/ADD
        convertMAD2MulAdd(i, bb);
        i++; // skip the add
    }
}
4537
isAccCandidate(G4_INST * inst,Gen4_Operand_Number opndNum,G4_Kernel & kernel)4538 static bool isAccCandidate(G4_INST* inst, Gen4_Operand_Number opndNum, G4_Kernel& kernel)
4539
4540 {
4541 if (!kernel.fg.builder->canMadHaveSrc0Acc())
4542 {
4543 return false;
4544 }
4545
4546 switch (opndNum)
4547 {
4548 case Opnd_src0:
4549 case Opnd_src1:
4550 break;
4551 default:
4552 return false;
4553 }
4554
4555 if (!inst->canSrcBeAcc(opndNum))
4556 {
4557 return false;
4558 }
4559
4560 return true;
4561 }
4562
// Records a single use of an operand within the current BB: the using
// instruction and which operand slot of that instruction holds the use.
struct LiveNode
{
    G4_INST* Inst;              // instruction containing the use
    Gen4_Operand_Number OpNum;  // which operand of Inst is the use
    LiveNode(G4_INST* Inst, Gen4_Operand_Number OpNum)
        : Inst(Inst)
        , OpNum(OpNum)
    {
    }
};
4573
4574 #define GLOBAL_USE_NUM 15
4575
isSameOperand(G4_Operand * srcOpnd,struct LiveNode * ln)4576 static bool isSameOperand(G4_Operand* srcOpnd, struct LiveNode* ln)
4577 {
4578 G4_Operand* opnd = ln->Inst->getOperand(ln->OpNum);
4579
4580 if (opnd->compareOperand(srcOpnd) == Rel_eq)
4581 {
4582 return true;
4583 }
4584
4585 return false;
4586 }
4587
// For global operands used many times as src0/src1 acc candidates within
// this BB, insert a copy at BB entry (insertCopyAtBBEntry) and rewrite the
// uses to the copy, so a later pass may substitute the accumulator.
void HWConformity::localizeForAcc(G4_BB* bb)
{
    std::map<const G4_Declare*, G4_Operand*> replacedOperand; // dcl -> its BB-local copy
    std::unordered_map<const G4_Declare*, std::vector<struct LiveNode>> useNodes; // dcl -> collected uses
    std::vector<const G4_Declare*> erasedCandidates; // dcls disqualified from localization

    curBB = bb;

    for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
    {
        G4_INST* inst = *instIter;

        // A global redefined in this BB is disqualified: earlier collected
        // uses would read a different value than later ones.
        G4_Operand* dst = inst->getOperand(Opnd_dst);
        if (dst && dst->isGreg() && kernel.fg.globalOpndHT.isOpndGlobal(dst))
        {
            const G4_Declare* dcl = dst->getTopDcl();
            if (useNodes.find(dcl) != useNodes.end())
            {
                useNodes.erase(dcl); //Maybe added again
                erasedCandidates.emplace_back(dcl); //erased declares
            }
        }

        //Source operand
        for (auto OpNum :
            { Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
            Gen4_Operand_Number::Opnd_src2 })
        {
            G4_Operand* src = inst->getOperand(OpNum);
            if (src && src->isGreg() && kernel.fg.globalOpndHT.isOpndGlobal(src))
            {
                const G4_Declare* dcl = src->getTopDcl();
                if ((OpNum != Opnd_src0 && //Acc can be used only for src0 and src1
                    OpNum != Opnd_src1) ||
                    !isAccCandidate(inst, OpNum, kernel)) //The operand cannot be replaced with ACC
                {
                    auto dclIter = std::find(erasedCandidates.begin(), erasedCandidates.end(), dcl);
                    if (dclIter == erasedCandidates.end())
                    {
                        erasedCandidates.emplace_back(dcl);
                    }
                }
                else
                {
                    // Record the use only if it matches the first recorded
                    // use of this dcl (all uses must be the same operand).
                    if (useNodes[dcl].empty() ||
                        isSameOperand(src, &(useNodes[dcl][0])))
                    {
                        useNodes[dcl].emplace_back(inst, OpNum);
                    }
                }
            }
        }
    }

    for (auto& Nodes : useNodes)
    {
        const G4_Declare* dcl = Nodes.first;
        auto dclIter = std::find(erasedCandidates.begin(), erasedCandidates.end(), dcl);
        if (dclIter != erasedCandidates.end())
        {
            //removed already
            continue;
        }

        // Localize only if used frequently enough to be worth the extra copy.
        if (Nodes.second.size() >= GLOBAL_USE_NUM)
        {
            for (auto& LN : Nodes.second)
            {
                G4_INST* inst = LN.Inst;
                Gen4_Operand_Number opNum = LN.OpNum;
                int i = inst->getSrcNum(opNum);
                G4_Operand* src = inst->getSrc(i);
                G4_Operand* tmpOpnd = nullptr;

                // Reuse the copy created for this dcl if there is one already.
                auto itR = replacedOperand.find(dcl);
                if (itR != replacedOperand.end())
                {
                    tmpOpnd = builder.duplicateOperand(itR->second);
                }
                else
                {
                    tmpOpnd = insertCopyAtBBEntry(bb, inst->getExecSize(), src);
                    replacedOperand[dcl] = tmpOpnd;
                }
                inst->setSrc(tmpOpnd, i);
            }
        }
    }

    return;
}
4680
// convert a pseudo mad inst into mul/add
4682 // return the iterator pointing to add
void HWConformity::convertMAD2MulAdd(INST_LIST_ITER iter, G4_BB* bb)
{
    // Lower pseudo_mad (dst = src0*src1 + src2) into:
    //   mul tmp, src0, src1   (the pseudo_mad is reused in place)
    //   add dst, tmp, src2    (inserted right after the mul)
    G4_INST* inst = *iter;
    assert(inst->opcode() == G4_pseudo_mad && "expect pseudo-mad");

    G4_DstRegRegion* addOpDst = inst->getDst();
    G4_Operand* addOpnd2 = inst->getSrc(2);
    G4_Type mulOpDstType = addOpDst->getType();
    G4_Type mulOpExecType = inst->getExecType();
    // pick the widest type of mad's src and dst as the intermediate type
    if (TypeSize(mulOpDstType) > TypeSize(mulOpExecType))
    {
        mulOpExecType = mulOpDstType;
    }

    mulOpDstType = mulOpExecType;

    G4_SubReg_Align subAlign = Get_G4_SubRegAlign_From_Type(mulOpDstType);

    // Reuse the MAD op for MUL.
    inst->setOpcode(G4_mul);
    inst->setSrc(nullptr, 2);

    // temp holding the intermediate product src0*src1
    G4_Declare* mulDefDcl = builder.createTempVar(inst->getExecSize(), mulOpDstType, subAlign);

    G4_DstRegRegion* mulOpDst = builder.createDstRegRegion(mulDefDcl, 1);
    inst->setDest(mulOpDst);

    // Follow with an ADD.
    INST_LIST_ITER tIter = iter;
    tIter++;

    auto addOpnd1 = builder.createSrcRegRegion(mulDefDcl, builder.getRegionStride1());
    G4_INST* addOp = builder.createInternalInst(
        inst->getPredicate(),
        G4_add,
        inst->getCondMod(),
        inst->getSaturate(),
        inst->getExecSize(),
        addOpDst,
        addOpnd1,
        addOpnd2,
        nullptr,
        inst->getOption());

    bb->insertBefore(tIter, addOp);

    // predicate/condmod/saturate, if they exist, are propagated to the add instruction
    inst->setSaturate(g4::NOSAT);
    inst->setPredicate(NULL);
    inst->setCondMod(nullptr);

    {
        // maintain def-use: the mad's src2 defs now feed the add's src1,
        // uses of the mad's dst move to the add, and the mul's result
        // becomes a def of the add's src0
        inst->transferDef(addOp, Opnd_src2, Opnd_src1);
        if (addOp->getPredicate())
        {
            inst->transferDef(addOp, Opnd_pred, Opnd_pred);
        }
        inst->transferUse(addOp);
        inst->addDefUse(addOp, Opnd_src0);
    }
}
4745
4746 // See if we can convert the pseudo_sada2 instruction into an actual Gen sada2
4747 // This can be done if the following conditions are met:
4748 // -- We can find the definition of the pseudo sada2 instruction's source 2 in
4749 // the same basic block, and that
4750 // -- it may be replaced by an acc (i.e., the src2 is its only use, the dst and
4751 // the src have identical regions, and there are no intervening instructions
4752 // that update acc)
4753 //
4754 // We additionally attempt to schedule up the sada2 instruction to be as close
4755 // as possible to the src2 defining instruction (subject to the constraints of
// def-use chains for def, src0 and src1), so that more opportunities may be
4757 // exposed for later sada2 instructions
4758
// Lower G4_pseudo_sada2 (dst = sad2(src0, src1) + src2) in this BB into either:
//   - a true sada2 that accumulates src2 through acc0, when src2 has exactly
//     one local def that can be retargeted to acc0 and no acc-touching
//     instruction lies between that def and the sada2; or
//   - the fallback sequence: sad2 into a temp, followed by an add of src2.
// The sada2 is also scheduled up as close as possible to the src2-defining
// instruction (see the comment above this function).
void HWConformity::fixSADA2Inst(G4_BB* bb)
{

    INST_LIST_ITER i = bb->begin();
    while (i != bb->end())
    {

        G4_INST* inst = *i;
        if (inst->opcode() != G4_pseudo_sada2)
        {
            ++i;
            continue;
        }

        G4_Operand* src2 = inst->getSrc(2);

        bool canDoSada2 = true;
        // the single instruction defining src2, if any
        G4_INST* src2Dst = NULL;

        // acc is implicitly offset by the quarter-control; a non-zero mask
        // offset in divergent control flow would read the wrong acc channels
        int emask = inst->getMaskOption();
        if (!bb->isAllLaneActive() &&
            emask != InstOpt_WriteEnable &&
            inst->getMaskOffset() != 0)
        {
            canDoSada2 = false;
        }

        G4_DstRegRegion* dst = inst->getDst();
        if (canDoSada2)
        {
            if (src2->isSrcRegRegion() && src2->asSrcRegRegion()->getRegAccess() == Direct)
            {
                // check Src2
                if (kernel.fg.globalOpndHT.isOpndGlobal(src2))
                {
                    // no sada2 if operand is global
                    canDoSada2 = false;
                }
                else if (src2->asSrcRegRegion()->getModifier() != Mod_src_undef)
                {
                    // no sada2 if src2 has a modifier
                    canDoSada2 = false;
                }
                else
                {
                    // src2 must have exactly one reaching def
                    for (auto defIter = inst->def_begin(), end = inst->def_end(); defIter != end; ++defIter)
                    {
                        if ((*defIter).second == Opnd_src2)
                        {
                            if (src2Dst != NULL)
                            {
                                // no sada2 if src2 has >1 definition
                                canDoSada2 = false;
                                break;
                            }
                            src2Dst = (*defIter).first;
                        }
                    }

                    if (!src2Dst)
                    {
                        canDoSada2 = false;
                    }
                    else
                    {
                        if (!src2Dst->hasOneUse())
                        {
                            // no sad2 if def has more than one use
                            canDoSada2 = false;
                        }
                        else
                        {
                            G4_DstRegRegion* src2DstOpnd = src2Dst->getDst();
                            G4_Type src2DstType = src2DstOpnd->getType();
                            if (src2DstOpnd->getRegAccess() != Direct
                                || (src2DstType != Type_W && src2DstType != Type_UW))
                            {
                                // no sada2 if def's dst is indirect, or it type is not W or UW
                                canDoSada2 = false;
                            }
                            else if (src2DstOpnd->compareOperand(src2) !=
                                Rel_eq)
                            {
                                // no sada2 if src2Dst and src2 are not equal
                                canDoSada2 = false;
                            }
                        }
                    }
                }
            }
            else
            {
                canDoSada2 = false;
            }
        }

        // The new location of the sada2 after the conversion
        INST_LIST_ITER newSada2Iter = i;
        --newSada2Iter;
        if (canDoSada2)
        {
            // try to schedule up the sada2 to be as close to the src2-defining instruction
            // as possible to expose more optimization opportunities
            for (; *newSada2Iter != src2Dst; --newSada2Iter)
            {
                if (inst->isRAWdep(*newSada2Iter) ||
                    inst->isWAWdep(*newSada2Iter) ||
                    inst->isWARdep(*newSada2Iter))
                {
                    break;
                }
            }

            // make sure there are no instructions between the sada2's new location
            // and the src2-defining instruction that updates acc
            for (auto iter = newSada2Iter; *iter != src2Dst; --iter)
            {
                G4_INST* aInst = *iter;
                if (aInst->hasACCOpnd())
                {
                    canDoSada2 = false;
                    break;
                }
            }
        }

        if (canDoSada2)
        {
            // We have verified all conditions and can convert this instruction to sada2.
            // replace the destination for src2Dst to be acc0.
            // The actual acc0 offset will be fixed in a later pass
            G4_DstRegRegion* accDstOpnd = builder.createDst(
                builder.phyregpool.getAcc0Reg(),
                0,
                0,
                1,
                src2->getType());
            src2Dst->setDest(accDstOpnd);
            if (src2Dst->getExecSize() == g4::SIMD1)
            {
                // This can happen for the first sada2 instruction if src2 is scalar
                // expand its execution size so that acc is fully defined
                src2Dst->setExecSize(inst->getExecSize());
            }

            // create an implicit acc parameter for sada2
            inst->setOpcode(G4_sada2);
            inst->setSrc(nullptr, 2);
            G4_SrcRegRegion* accSrcOpnd = builder.createSrc(
                builder.phyregpool.getAcc0Reg(),
                0,
                0,
                builder.getRegionStride1(),
                src2->getType());

            inst->setImplAccSrc(accSrcOpnd);

            // move the sada2 to its scheduled position (right after newSada2Iter)
            ++newSada2Iter;
            bb->insertBefore(newSada2Iter, inst);
            i = bb->erase(i);

            // maintain def-use: retarget the src2 edges to the implicit acc src

            for (auto tmpIter = src2Dst->use_begin(), end = src2Dst->use_end(); tmpIter != end; ++tmpIter)
            {
                if ((*tmpIter).first == inst && (*tmpIter).second == Opnd_src2)
                {
                    (*tmpIter).second = Opnd_implAccSrc;
                    break;
                }
            }

            for (auto tmpIter = inst->def_begin(), end = inst->def_end(); tmpIter != end; ++tmpIter)
            {
                if ((*tmpIter).first == src2Dst && (*tmpIter).second == Opnd_src2)
                {
                    (*tmpIter).second = Opnd_implAccSrc;
                    break;
                }
            }
        }
        else
        {
            // pseudo_sada2 (N) dst src0 src1 src2
            // becomes
            // sad2 (n) tmp<1>:w src0 src1
            // add (n) dst tmp<n;n,1>:w src2

            inst->setOpcode(G4_sad2);
            inst->setSrc(nullptr, 2);

            G4_SubReg_Align sad2TmpSubAlign = Get_G4_SubRegAlign_From_Type(dst->getType());

            if ((unsigned)inst->getExecSize() * dst->getTypeSize() > numEltPerGRF<Type_UB>())
            {
                // align to GRF
                sad2TmpSubAlign = GRFALIGN;
            }
            // create a new temp variable as sad2's destination
            G4_Declare* sad2Tmp = builder.createTempVar(inst->getExecSize(), dst->getType(), sad2TmpSubAlign);
            G4_DstRegRegion* sad2Dst = builder.createDstRegRegion(sad2Tmp, 1);
            inst->setDest(sad2Dst);

            uint16_t srcVertStride, srcWidth, srcHorzStride;
            srcWidth = inst->getExecSize() > g4::SIMD8 ? g4::SIMD8 : inst->getExecSize();
            srcHorzStride = 1;
            srcVertStride = srcWidth;

            // opnd 0 for add is the new temp we've just created
            const RegionDesc* rd = builder.createRegionDesc(srcVertStride, srcWidth, srcHorzStride);
            G4_Operand* addSrc0Opnd = builder.createSrc(sad2Dst->getBase(),
                0, 0, rd, sad2Dst->getType());

            // opnd 1 is src2 of the pseudo_sada2
            // dst is the same as the pseudo_sada2
            G4_INST* addInst = builder.createInternalInst(
                inst->getPredicate(),
                G4_add,
                inst->getCondMod(),
                inst->getSaturate(),
                inst->getExecSize(),
                dst,
                addSrc0Opnd,
                src2,
                NULL,
                inst->getOption());

            INST_LIST_ITER addLoc = i;
            ++addLoc;
            bb->insertBefore(addLoc, addInst);

            // FIXME: redundant?
            inst->addDefUse(addInst, Opnd_src0);

            // The sad2 op should not have the SAT attribute set,
            // as this is intended only for the final result of the
            // SADA2 (and thus the add op will keep the SAT attribute).
            inst->setSaturate(g4::NOSAT);
            inst->setPredicate(NULL);

            {
                // move src2's defs and all uses of the original dst to the add
                inst->transferDef(addInst, Opnd_src2, Opnd_src1);
                if (addInst->getPredicate())
                {
                    inst->transferDef(addInst, Opnd_pred, Opnd_pred);
                }
                inst->transferUse(addInst);
                inst->addDefUse(addInst, Opnd_src0);
            }
            ++i;
        }
    }
}
5012
// Legalize send instructions in this BB:
// - force even-GRF alignment on dst/src0/src1 when a <2 GRF A64 address
//   payload could otherwise overlap them (known to cause a HW hang)
// - GRF-align the dst and (for split send) src1
// - copy src0/src1 into temps when EOT or preemption restrictions forbid
//   their current hard-wired physical registers
// - under WaDisableSendSrcDstOverlap, break dst/src overlap by copying
//   whichever side is cheaper
void HWConformity::fixSendInst(G4_BB* bb)
{

    for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
    {

        G4_INST* inst = *i;
        if (!inst->isSend())
        {
            continue;
        }

        if (inst->getExecSize() < builder.getNativeExecSize())
        {
            // A64 messages require a minimum msg len of two for address (src0), which is inconsistent
            // with our input IR as it allows <2 GRF address variables (e.g., simd1 A64 scatter r/w).
            // To avoid this causing overlap between send dst/src0/src1 (it is known to cause HW hang),
            // we have to ensure they are all 2GRF-aligned
            G4_Declare* src0Dcl = inst->getSrc(0)->getTopDcl();
            // ToDo: check if dst/src1 may also exhibit such size mismatch
            bool sizeMismatch = inst->getMsgDesc()->getSrc0LenRegs() == 2 &&
                (src0Dcl && src0Dcl->getRootDeclare()->getByteSize() < 2u * numEltPerGRF<Type_UB>());
            auto doEvenAlign = [](G4_Declare* dcl)
            {
                if (dcl)
                {
                    dcl = dcl->getRootDeclare();
                    // variables >= 2 GRF don't need even alignment since they can't possibly overlap
                    if (dcl->getByteSize() < 2u * numEltPerGRF<Type_UB>())
                    {
                        dcl->setEvenAlign();
                    }
                }
            };
            if (sizeMismatch)
            {
                doEvenAlign(inst->getSrc(0)->getTopDcl());
                if (inst->isSplitSend())
                {
                    doEvenAlign(inst->getSrc(1)->getTopDcl());
                }
                if (builder.WaDisableSendSrcDstOverlap())
                {
                    doEvenAlign(inst->getDst()->getTopDcl());
                }
            }
        }

        // send dst must be GRF-aligned
        uint16_t offset = 0;
        if (!builder.isOpndAligned(inst->getDst(), offset, numEltPerGRF<Type_UB>()))
        {
            replaceDst(i, inst->getDst()->getType(), GRFALIGN);
        }

        G4_Operand* src0 = inst->getSrc(0);
        G4_Declare* src0TopDcl = src0->getTopDcl();

        // if src0 and src1 are hard-wired GRF, check that
        // they satisfy EOT and preemption restrictions
        auto needsTempSrc = [this](G4_INST* inst, G4_Declare* dcl)
        {
            // EOT payload must live in r112+; preemption reserves r0-r1
            return dcl->getRegVar() && dcl->getRegVar()->getPhyReg() &&
                ((inst->isEOT() && builder.hasEOTGRFBinding() &&
                    dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 112) ||
                    (builder.getOption(vISA_enablePreemption) &&
                        dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 2));
        };

        // copy the payload row-by-row into a fresh GRF-aligned temp and
        // substitute the temp as the send source
        auto fixSrc = [&](G4_INST* inst, bool isSrc0)
        {
            auto sendSrc = isSrc0 ? inst->getSrc(0)->asSrcRegRegion() : inst->getSrc(1)->asSrcRegRegion();
            uint16_t rows = isSrc0 ? inst->getMsgDesc()->getSrc0LenRegs() : inst->getMsgDesc()->getSrc1LenRegs();
            G4_Type type = sendSrc->getType();
            G4_Declare* dcl = builder.createTempVar(rows * builder.getNativeExecSize(), type, GRFALIGN);

            MUST_BE_TRUE(TypeSize(type) == 4, "Invalid src opnd type for send.");

            const RegionDesc* region = builder.getRegionStride1();
            G4_VarBase* base = sendSrc->getBase();
            short baseOff = sendSrc->getRegOff();
            short baseSubOff = sendSrc->getSubRegOff();
            for (uint16_t idx = 0; idx != rows; ++idx) {
                G4_SrcRegRegion* src = builder.createSrc(base, baseOff + idx, baseSubOff + 0, region, type);
                G4_DstRegRegion* dst = builder.createDst(dcl->getRegVar(), idx, 0, 1, type);
                G4_INST* newInst = builder.createMov(builder.getNativeExecSize(), dst, src, InstOpt_WriteEnable, false);
                bb->insertBefore(i, newInst);
            }

            G4_Operand* newSrc = builder.createSrcRegRegion(dcl, builder.getRegionStride1());
            inst->setSrc(newSrc, isSrc0 ? 0 : 1);
        };

        if (needsTempSrc(inst, src0TopDcl))
        {
            fixSrc(inst, true);
        }

        if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg())
        {
            // src1 may be null because some messages (e.g., CPS) require split send
            if (!builder.isOpndAligned(inst->getSrc(1), numEltPerGRF<Type_UB>()))
            {
                inst->setSrc(insertMovBefore(i, 1, inst->getSrc(1)->getType(), bb, GRFALIGN), 1);
            }
            G4_Operand* src1 = inst->getSrc(1);
            G4_Declare* src1TopDcl = src1->getTopDcl();

            if (needsTempSrc(inst, src1TopDcl))
            {
                fixSrc(inst, false);
            }
        }

        if (builder.getOption(vISA_enablePreemption))
        {
            // sanity check: send dst must not be pre-assigned to the reserved r0-r2
            G4_DstRegRegion* dst = inst->getDst();
            if (!dst->isNullReg())
            {
                G4_Declare* dstTopDcl = dst->getTopDcl();
                if (dstTopDcl != NULL &&
                    dstTopDcl->getRegVar() &&
                    dstTopDcl->getRegVar()->getPhyReg())
                {
                    MUST_BE_TRUE((dstTopDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() > 2), "Unexpected preg used for send destination.");
                }
            }
        }

        if (builder.WaDisableSendSrcDstOverlap())
        {
            // create copy if dst and src0/src1 overlap due to being the same variable
            bool src0Overlap = inst->getDst()->compareOperand(inst->getSrc(0)) != Rel_disjoint;
            bool src1Overlap = inst->isSplitSend() && inst->getDst()->compareOperand(inst->getSrc(1)) != Rel_disjoint;
            if (src0Overlap || src1Overlap)
            {
                int dstSize = inst->getMsgDesc()->getDstLenRegs();
                int src0Size = src0Overlap ? inst->getMsgDesc()->getSrc0LenRegs() : 0;
                int src1Size = src1Overlap ? inst->getMsgDesc()->getSrc1LenRegs() : 0;
                if (inst->getPredicate() || (bb->isDivergent() && !inst->isWriteEnableInst()) || dstSize > src0Size + src1Size)
                {
                    //copy src0/src1 if inst does not update all channels
                    //ToDo: the copies may be OOB if src0/src1 are scalar. It should be ok since we don't care about the values,
                    //but IR verifier might complain about OOB.
                    if (src0Overlap)
                    {
                        G4_Declare* copyDst = builder.createTempVar(src0Size * numEltPerGRF<Type_UD>(), Type_UD, Any);
                        copyRegs(copyDst, 0, inst->getSrc(0)->getBase()->asRegVar()->getDeclare(),
                            inst->getSrc(0)->asSrcRegRegion()->getRegOff() * getGRFSize(), src0Size, bb, i);
                        inst->setSrc(builder.createSrcRegRegion(copyDst, builder.getRegionStride1()), 0);
                    }
                    if (src1Overlap)
                    {
                        G4_Declare* copyDst = builder.createTempVar(src1Size * numEltPerGRF<Type_UD>(), Type_UD, Any);
                        copyRegs(copyDst, 0, inst->getSrc(1)->getBase()->asRegVar()->getDeclare(),
                            inst->getSrc(1)->asSrcRegRegion()->getRegOff() * getGRFSize(), src1Size, bb, i);
                        inst->setSrc(builder.createSrcRegRegion(copyDst, builder.getRegionStride1()), 1);
                    }
                }
                else
                {
                    // copy dst
                    auto dst = inst->getDst();
                    auto dstDcl = dst->getBase()->asRegVar()->getDeclare();
                    auto copyIter = std::next(i);
                    G4_Declare* copySrc = builder.createTempVar(dstSize * numEltPerGRF<Type_UD>(), Type_UD, Any);
                    // special case when send dst declare is <1 GRF (it must still be GRF-aligned)
                    if (dstDcl->getByteSize() < getGRFSize())
                    {
                        auto numDWords = dstDcl->getByteSize() / TypeSize(Type_UD);
                        assert(numDWords > 0);
                        copyDwords(dstDcl, 0, copySrc, 0, numDWords, bb, copyIter);
                    }
                    else
                    {
                        copyRegs(dstDcl, dst->getRegOff() * getGRFSize(),
                            copySrc, 0, dstSize, bb, copyIter);
                    }
                    inst->setDest(builder.createDstRegRegion(copySrc, 1));
                }
            }
        }

    }

}
5198
fixsrc1src2Overlap(G4_BB * bb)5199 void HWConformity::fixsrc1src2Overlap(G4_BB* bb)
5200 {
5201 for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
5202 {
5203 G4_INST* inst = *i;
5204
5205 if (inst->opcode() != G4_mad)
5206 {
5207 continue;
5208 }
5209
5210 G4_Operand* src1 = inst->getSrc(1);
5211 G4_Operand* src2 = inst->getSrc(2);
5212
5213 if (src1 && src2 &&
5214 !src1->isNullReg() && !src2->isNullReg() &&
5215 src1->getType() == src2->getType())
5216 {
5217 G4_CmpRelation cmpResult = src1->compareOperand(src2);
5218 if (cmpResult != Rel_disjoint && cmpResult != Rel_undef)
5219 {
5220 G4_Type movType = src2->getType();
5221 bool changeType = true;
5222 switch (src2->getType())
5223 {
5224 case Type_DF:
5225 movType = Type_UQ;
5226 break;
5227 case Type_F:
5228 movType = Type_UD;
5229 break;
5230 case Type_HF:
5231 movType = Type_UW;
5232 break;
5233 default:
5234 changeType = false;
5235 break;
5236 }
5237 if (changeType)
5238 {
5239 G4_Operand* opnd = insertMovBefore(i, 2, movType, bb);
5240 INST_LIST_ITER prev_it = i;
5241 prev_it--;
5242 G4_INST* movInst = (*prev_it);
5243 movInst->getSrc(0)->asSrcRegRegion()->setType(movType);
5244 opnd->asSrcRegRegion()->setType(src2->getType());
5245 inst->setSrc(opnd, 2);
5246 }
5247 }
5248 }
5249 }
5250 }
5251
fixOverlapInst(G4_BB * bb)5252 void HWConformity::fixOverlapInst(G4_BB* bb)
5253 {
5254 for (INST_LIST_ITER i = bb->begin(), end = bb->end(); i != end; i++)
5255 {
5256 G4_INST* inst = *i;
5257
5258 if (inst->mayExceedTwoGRF() || inst->opcode() == G4_madm)
5259 {
5260 continue;
5261 }
5262
5263 if (inst->getDst() != NULL)
5264 {
5265 // create copy if dst and src0/src1 overlap due to being the same variable
5266 G4_Operand* dst = inst->getDst();
5267 if (dst != NULL && dst->isDstRegRegion() && dst->getTopDcl() && dst->getTopDcl()->getRegFile() == G4_GRF)
5268 {
5269 int dstSize = (dst->getLinearizedEnd() - dst->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
5270 int srcSize = 1;
5271
5272 bool srcOverlap = false;
5273 for (int i = 0; i < inst->getNumSrc(); i++)
5274 {
5275 G4_Operand* src = inst->getSrc(i);
5276 if (src != NULL && !src->isNullReg() && src->getTopDcl() && src->getTopDcl()->getRegFile() == G4_GRF)
5277 {
5278 srcOverlap |= inst->getDst()->compareOperand(inst->getSrc(i)) == Rel_interfere;
5279 if (srcOverlap)
5280 {
5281 srcSize = (src->getLinearizedEnd() - src->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
5282 break;
5283 }
5284 }
5285 }
5286
5287 if (srcOverlap && (dstSize > 1 || srcSize > 1))
5288 {
5289 G4_AccRegSel accSel = inst->getDst()->getAccRegSel();
5290 G4_DstRegRegion* newDst = insertMovAfter(i, inst->getDst(), inst->getDst()->getType(), bb);
5291 newDst->setAccRegSel(accSel);
5292 inst->setDest(newDst);
5293 }
5294 }
5295 }
5296 }
5297 }
5298
5299 //
5300 // Fix sel and csel instructions:
5301 // -- set their cond mod to null as they don't modify it. They will be hard-coded to f0.0 in Gen asm
5302
fixSelCsel(INST_LIST_ITER it,G4_BB * bb)5303 void HWConformity::fixSelCsel(INST_LIST_ITER it, G4_BB* bb)
5304 {
5305 G4_INST* inst = *it;
5306 if (inst->opcode() == G4_sel || inst->opcode() == G4_csel)
5307 {
5308 G4_CondMod* condMod = inst->getCondMod();
5309 if (condMod)
5310 {
5311 condMod->setBase(nullptr);
5312 }
5313 }
5314 }
5315
avoidDstSrcOverlap(PointsToAnalysis & p)5316 void HWConformity::avoidDstSrcOverlap(PointsToAnalysis& p)
5317 {
5318 for (auto& bb : kernel.fg)
5319 {
5320 INST_LIST_ITER i = bb->begin(), iEnd = bb->end();
5321 INST_LIST_ITER next_iter = i;
5322 curBB = bb;
5323 for (; i != iEnd; i = next_iter)
5324 {
5325 ++next_iter;
5326 avoidInstDstSrcOverlap(i, bb, p);
5327 }
5328 }
5329 }
5330
5331 //
5332 // Avoid the dst and src overlap when they are using the same variable by inserting a mov instruction
5333 // add(8) var1<2>, var2, var1<0, 1, 0>
5334 //
// Break dst/src overlap within one instruction by inserting a mov:
// - a >1 GRF dst with any VxH (region-WH) source conservatively gets a new dst
// - a direct source on the same declare as the dst gets copied to a temp when
//   their linearized GRF footprints overlap and either side spans >1 GRF
// - an indirect source whose points-to set includes the dst declare forces a
//   new dst (we can't prove the address doesn't alias it)
void HWConformity::avoidInstDstSrcOverlap(INST_LIST_ITER it, G4_BB* bb, PointsToAnalysis& p)
{
    G4_INST* inst = *it;

    if (inst->mayExceedTwoGRF() ||
        inst->opcode() == G4_nop ||
        inst->opcode() == G4_madm ||
        inst->isLabel())
    {
        return;
    }

    auto dst = inst->getDst();
    if (!dst ||
        dst->isNullReg() ||
        !dst->getBase()->isRegVar())
    {
        return;
    }

    // size in bytes the dst rows span (exec size * element size * stride)
    auto dstSize = inst->getExecSize() * dst->getTypeSize() * dst->getHorzStride();
    //Handle VxH
    if (dstSize > getGRFSize())
    {
        // special check for 2-GRF instruction with VxH operands
        // strictly speaking dst and VxH src may overlap only if src's address may point to dst variable,
        // but we skip such check as VxH access is rare and already expensive, so adding an extra move won't cause much extra overhead
        bool hasVxH = std::any_of(inst->src_begin(), inst->src_end(),
            [](G4_Operand* src) { return src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegion()->isRegionWH(); });
        if (hasVxH)
        {
            replaceDst(it, dst->getType());
            return;
        }
    }

    G4_Declare* dstDcl = dst->getTopDcl();
    if (dstDcl)
    {
        G4_DstRegRegion* dstRgn = dst;
        // number of GRF rows the dst touches, and the byte range of its
        // first row (for a multi-row dst only the first row can overlap a
        // src that starts in a later row — see srcLeft below)
        int dstOpndNumRows = ((dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart()) / numEltPerGRF(Type_UB)) + 1;
        int dstLeft = dstRgn->getLinearizedStart();
        int dstRight = dstOpndNumRows > 1 ? ((dstLeft / numEltPerGRF(Type_UB) + 1) * numEltPerGRF(Type_UB) - 1) :
            dstRgn->getLinearizedEnd();

        for (int i = 0, nSrcs = inst->getNumSrc(); i < nSrcs; i++)
        {
            G4_Operand* src = inst->getSrc(i);

            if (!src || src->isNullReg() || !src->getTopDcl())
            {
                continue;
            }
            G4_Declare* srcDcl = src->getTopDcl();
            G4_CmpRelation rel = dst->compareOperand(src);
            if (src->isSrcRegRegion())
            {
                G4_SrcRegRegion* srcRg = src->asSrcRegRegion();
                if (srcDcl == dstDcl &&
                    srcRg->getRegAccess() == Direct &&
                    srcRg->getBase()->isRegVar())
                {
                    if (rel != Rel_disjoint && rel != Rel_undef) //Overlap
                    {
                        G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
                        int srcOpndNumRows = ((srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart()) / numEltPerGRF(Type_UB)) + 1;
                        int srcLeft = srcRgn->getLinearizedStart();
                        int srcRight = srcRgn->getLinearizedEnd();

                        if (!srcRgn->isScalar() && srcOpndNumRows > 1)
                        {
                            // multi-row non-scalar src: only its second row
                            // onward can conflict with the dst's first row
                            srcLeft = (srcRgn->getLinearizedStart() / numEltPerGRF(Type_UB) + 1) * numEltPerGRF(Type_UB);
                        }

                        if (dstOpndNumRows > 1 || srcOpndNumRows > 1)
                        {
                            // byte-range intersection => insert a copy of the src
                            if (!(srcLeft > dstRight || dstLeft > srcRight))
                            {
                                inst->setSrc(insertMovBefore(it, i, src->getType(), bb), i);
                            }
                        }
                    }
                }
                else if (srcRg->isIndirect())
                {
                    // walk the points-to set of the address variable; if the
                    // dst declare may be addressed, redirect the dst instead
                    G4_RegVar* ptvar = NULL;
                    int vid = 0;
                    while ((ptvar = p.getPointsTo(srcDcl->getRegVar(), vid++)) != NULL)
                    {
                        G4_Declare* dcl = ptvar->getDeclare();
                        if (dstDcl == dcl)
                        {
                            G4_AccRegSel accSel = inst->getDst()->getAccRegSel();
                            G4_DstRegRegion* newDst = insertMovAfter(it, inst->getDst(), inst->getDst()->getType(), bb);
                            newDst->setAccRegSel(accSel);
                            inst->setDest(newDst);
                            return;
                        }
                    }
                }
            }
        }
    }
}
5439
fixCalla(INST_LIST_ITER it,G4_BB * bb)5440 void HWConformity::fixCalla(INST_LIST_ITER it, G4_BB *bb)
5441 {
5442 G4_INST* fcall = *it;
5443 G4_Operand* src0 = fcall->getSrc(0);
5444
5445 // fcall could have imm/label src for direct call
5446 // No need to fix src reg at the case
5447 if (!src0->isSrcRegRegion())
5448 return;
5449
5450 if (builder.isOpndAligned(src0, getGRFSize()))
5451 return;
5452
5453 // insert a mov before fcall(calla) to mov src to a grf aligned reg
5454 replaceSrc(it, 0, src0->getType(), bb, GRFALIGN);
5455 }
5456
// Replace a dpas instruction's HF/BF src0 with a float copy (used for
// Wa_22010725011): widen the HF/BF payload into a new F temp with a series
// of moves (2 GRFs per mov), then rewrite src0 to read the temp.
void HWConformity::replaceHFBFwithFloat(INST_LIST_ITER it, G4_BB* bb)
{
    auto* inst = *it;
    auto* dst = inst->getDst();
    auto* src0 = inst->getSrc(0);
    assert(src0->getType() == Type_BF || src0->getType() == Type_HF);

    G4_InstDpas* dpasInst = inst->asDpasInst();
    uint8_t C = dpasInst->getRepeatCount();

    // size of the widened (float) copy, derived from src0's byte footprint
    // scaled by the F/HF size ratio
    unsigned int src_l = src0->getLinearizedStart();
    unsigned int src_r = src0->getLinearizedEnd();
    unsigned int dstGRFSize = (src_r - src_l + 1) * (TypeSize(Type_F) / src0->getTypeSize());
    unsigned movInstNum = (((dstGRFSize + getGRFSize() - 1) / getGRFSize()) + 1) / 2; //2 GRFs per instruction

    G4_Declare* dcl = builder.createTempVar(builder.getNativeExecSize() * C, Type_F, ThirtyTwo_Word);

    // Copy HF/BF data to float with mov instructions.
    // If the new destination is more than 2 GRFs, multiple moves required.
    for (unsigned i = 0; i < movInstNum; i++)
    {
        // each mov writes 2 GRF rows of the temp
        G4_DstRegRegion* newDst = builder.createDst(
            dcl->getRegVar(),
            2 * i,
            0,
            dst->getHorzStride(),
            Type_F);

        G4_Operand* newSrc = builder.createSrc(
            src0->getBase(),
            src0->asSrcRegRegion()->getRegOff() + i,
            src0->asSrcRegRegion()->getSubRegOff(),
            builder.getRegionStride1(),
            src0->asSrcRegRegion()->getType());

        // full 2-GRF worth of floats, except possibly the last (remainder) mov
        G4_ExecSize numOfF {(2 * getGRFSize()) / TypeSize(Type_F)};
        if (i == movInstNum - 1)
        {
            numOfF = G4_ExecSize((dstGRFSize / TypeSize(Type_F)) - i * numOfF);
        }
        G4_INST* newInst = builder.createMov(numOfF, newDst, newSrc, InstOpt_WriteEnable, false);

        bb->insertBefore(it, newInst);
    }

    //Replace the original source with the float type operand
    G4_Operand* newSrc0 = builder.createSrc(
        dcl->getRegVar(),
        0,
        0,
        builder.getRegionStride1(),
        dcl->getElemType());
    inst->setSrc(newSrc0, 0);

    return;
}
5513
fixDPAS(INST_LIST_ITER it,G4_BB * bb)5514 void HWConformity::fixDPAS(INST_LIST_ITER it, G4_BB *bb)
5515 {
5516 G4_INST* inst = *it;
5517
5518 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010725011) &&
5519 !builder.getOption(vISA_EnableDPASBFHFH))
5520 {
5521 G4_Type src0Type = inst->getSrc(0)->getType();
5522
5523 if (src0Type == Type_BF || src0Type == Type_HF)
5524 {
5525 replaceHFBFwithFloat(it, bb);
5526 }
5527 }
5528 }
5529
// Apply every per-instruction HW conformity check/fix to one basic block.
// Fixes run in a deliberate order (e.g. fixSrcRegion first so later inserted
// moves don't inherit bad regions); several fixes rewind next_iter so their
// newly inserted instructions are themselves re-checked. Trailing catch-all
// passes handle regioning restrictions, int64 mov emulation, byte-crossbar
// restrictions, and PVC-XT cross-GRF QWord dst splitting.
void HWConformity::conformBB(G4_BB* bb)
{
    INST_LIST_ITER i = bb->begin(), iEnd = bb->end();
    INST_LIST_ITER next_iter = i;
    for (; i != iEnd; i = next_iter)
    {
        // by default we skip the newly inserted instructions as we assume they are already HW conformed
        // if a check may produce new instructions that violate HW rules, it must adjust the next_iter
        // to point to them
        ++next_iter;
        G4_INST* inst = *i;
        G4_opcode opcode = inst->opcode();

        if (inst->isDpas())
        {
            fixDPAS(i, bb);
            continue;
        }

        if (inst->isFCall() && builder.supportCallaRegSrc())
            fixCalla(i, bb);

        // sends and nop/label need no further per-inst fixing
        if ((inst->mayExceedTwoGRF() && !inst->isSend()) ||
            opcode == G4_nop ||
            opcode == G4_label)
        {
            continue;
        }

        if (builder.getOption(vISA_InsertDummyMovForHWRSWA) &&
            (VISA_WA_CHECK(builder.getPWaTable(), Wa_16012061344) ||
                VISA_WA_CHECK(builder.getPWaTable(), Wa_16012292205)))
        {
            fixPredicateIndirectInst(i, bb);
        }
        // do this early since otherwise the moves inserted by other passes may still
        // inherit bad regions from the original inst
        fixSrcRegion(inst);

        bool changed = fixMov(i, bb);
        if (changed)
        {
            // re-check the instruction(s) fixMov produced
            next_iter = i;
            next_iter++;
        }

        fixOpndType(i, bb);

        fixSelCsel(i, bb);

        fixPredCtrl(i, bb);

        if (inst->getExecSize() > builder.getNativeExecSize())
        {
            if (inst->opcode() == G4_math &&
                inst->getDst()->getType() == Type_HF &&
                inst->getSrc(0)->getType() == Type_HF &&
                (!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
            {
                // split pure HF math to simd8
                evenlySplitInst(i, bb);
            }
        }
        fix3SrcInst(i, bb);

        G4_Operand* dst = inst->getDst();

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->isMath())
        {
            if (fixMathInst(i, bb))
            {
                // check the newly added insts later
                next_iter = i;
                next_iter++;
            }
        }

        // fixes above may have replaced the instruction at i
        inst = *i;

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_mul)
        {
            if (fixMULInst(i, bb))
            {
                // inserted mach and mov
                // check the newly added insts later (MUL, MACH, MOV)
                next_iter = i;
                next_iter++;
            }
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_mulh)
        {
            fixMULHInst(i, bb);
            next_iter = i;
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        if (inst->opcode() == G4_madw)
        {
            next_iter = fixMadwInst(i, bb);
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        // HW check #6: indirect operand spilling
        fixIndirectOpnd(i, bb);

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        // HW check #8: unsigned dst with execution type F
        /* If the execution type is F and the destination type if either UD, UW
         * or UB and the detination is not saturated, then we need to add an
         * intermediate type conversion to D.
         */
        inst = *i;
        opcode = inst->opcode();

        if (opcode == G4_cmp || opcode == G4_cmpn)
        {
            dst = inst->getDst();
            int dst_elsize = 0;
            bool null_dst = !dst || inst->hasNULLDst();
            if (!null_dst)
            {
                // a predicate dst is treated as UW-sized
                dst_elsize = dst->isPredicate() ? TypeSize(Type_UW) : dst->getTypeSize();
            }
            int extypesize;
            G4_Type extype = inst->getOpExecType(extypesize);
            fixCompareInst(i, bb, extype, dst_elsize);
        }
        dst = inst->getDst();

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        if (fixAcc(i, bb))
        {
            next_iter = i;
            next_iter++;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        {
            // SIMD1 dst narrower than the exec type needs its hstride fixed
            dst = inst->getDst();
            G4_Type extype = inst->getExecType2();
            int extypesize = TypeSize(extype);
            int dst_elsize = 0;
            if (dst)
            {
                dst_elsize = dst->getTypeSize();
            }

            if (dst &&
                inst->getExecSize() == g4::SIMD1 &&
                dst_elsize < extypesize &&
                !IS_VTYPE(extype) &&
                !inst->isMixedMode() &&
                !hasDedicateAlignRegionConformity(inst) &&
                !inst->isSend())
            {
                fixDstHstride(i, extypesize);
            }
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif

        bool planeDeleted = fixPlaneInst(i, bb);
        if (planeDeleted)
        {
            continue;
        }

        fixLine(i, bb);
        fixRotate(i, bb);

        if (!builder.hasVxHFloat64b())
        {
            fixVxHFloat64b(i, bb);
        }

        if (fix64bInst(i, bb))
        {
            continue;
        }

#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
        fixImm64(i, bb); // fixed immediates for DF4 in fixImm64()

        if ((*i)->opcode() == G4_mov)
        {
            if (fixBFMove(i, bb))
            {
                continue;
            }
        }
        if ((*i)->opcode() == G4_fcvt)
        {
            (void)fixFcvt(i, bb);
            continue;
        }
        if ((*i)->opcode() == G4_srnd)
        {
            (void)fixSrnd(i, bb);
            continue;
        }

        if ((*i)->opcode() == G4_shl || (*i)->opcode() == G4_shr || (*i)->opcode() == G4_asr)
        {
            fixShiftInsts(i, bb);
            continue;
        }

        if (builder.getPlatform() == GENX_BDW)
        {
            fixPackedHFConversions(i, bb);
        }

        fixFloatARFDst(i, bb);
    }

    if (!builder.supportFloatOr64bRegioning())
    {
        for (auto iter = bb->begin(), iterEnd = bb->end(); iter != iterEnd; /* empty */)
        {
            // pre-compute nextIter as the call may destroy iter
            auto nextIter = std::next(iter);
            // since insertMovBefore/After and similar helper instructions do not
            // understand XeHP_SDV regioning restrictions, they may produce illegal moves
            // We do a catch call pass here to catch them
            fixUnalignedRegions(iter, bb);
            iter = nextIter;
        }
    }

    // previous legalization passes may introduce int64 moves on platforms that don't support int64
    // we do another catch-all pass here to legalize any such moves
    // ToDo: see if we can remove other calls to emulate64Mov()
    if (builder.noInt64())
    {
        for (auto I = bb->begin(), E = bb->end(); I != E;)
        {
            auto inst = *I;
            auto next = std::next(I);
            if (inst->opcode() == G4_mov && (IS_QTYPE(inst->getDst()->getType()) || IS_QTYPE(inst->getSrc(0)->getType())))
            {
                emulate64bMov(I, bb);
            }
            I = next;
        }
    }

    // the remaining passes only apply to platforms with native exec size > 8
    if (builder.getNativeExecSize() <= g4::SIMD8)
    {
        return;
    }
    i = bb->begin(), iEnd = bb->end();
    next_iter = i;
    for (; i != iEnd; i = next_iter)
    {
        // by default we skip the newly inserted instructions as we assume they are already HW conformed
        // if a check may produce new instructions that violate HW rules, it must adjust the next_iter
        // to point to them
        ++next_iter;
        fixByteXBarRestriction(i, bb);
#ifdef _DEBUG
        verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
    }

    // PVC-XT: split non-send/non-dpas instructions whose QWord dst crosses a
    // GRF boundary, unless a QWord source is present
    if (builder.getPlatform() == GENX_PVCXT) {
        for (auto I = bb->begin(), E = bb->end(); I != E;) {
            auto inst = *I;
            auto next = std::next(I);

            G4_DstRegRegion *dst = inst->getDst();
            bool crossGRFDst = dst && dst->isCrossGRFDst();

            if (crossGRFDst && IS_QTYPE(dst->getType()) && !inst->isSend() &&
                !inst->isDpas()) {
                bool hasQTypeSrc = false;
                for (int i = 0; i < inst->getNumSrc(); i++) {
                    if (IS_QTYPE(inst->getSrc(i)->getType())) {
                        hasQTypeSrc = true;
                        break;
                    }
                }

                if (!hasQTypeSrc) {
                    evenlySplitInst(I, bb);

#ifdef _DEBUG
                    verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
                }
            }

            I = next;
        }
    }
}
5857
5858 //
5859 // SIMD16 addc/subb are illegal on GEN, since they write to acc and there are
5860 // only 8 acc channels for D/UD type. In vISA IR we should get something like
5861 // addc (16|M0) V0 V2 V3
5862 // use (16|M0) V1 ... acc0:ud // or :d
5863 // which needs to be translated to
5864 // addc (8|M0) V0(0) V2(0) V3(0)
5865 // use (8|M0) V1(0) ... acc0:ud
5866 // addc (8|M8) V0(1) V2(1) V3(1)
5867 // use (8|M8) V1(1) ... acc0:ud
5868 // NOTE: we also support other consumers such as add.
5869 //
5870 //
5871 // We do this first thing in HW conformity to avoid REXES from splitting addc/subb incorrectly
5872 // We also count on previous opt to preserve the inst pair by not inserting any acc using inst in between;
5873 // it should hopefully be the case since we generally don't optimize instructions with acc src/dst
5874 //
5875 // If exec size of addc is < 8, we also have to make sure both the addc's dst and the carry move's dst are
5876 // GRF-aligned, since acc's channel is dependent on the dst's subreg offset. In other words, we fix
5877 // addc (1) r1.0 ...
5878 // mov (1) r1.1 acc0.0<0;1,0>
5879 // into
5880 // addc (1) r1.0 ...
5881 // mov (1) r2.0 acc0.0<0;1,0>
5882 // mov (1) r1.1 r2.0
5883 //
bool HWConformity::fixAddcSubb(G4_BB* bb)
{
    // Legalize addc/subb and their acc-carry consumer in this BB.
    // Two fixes are applied (see the block comment above this function):
    //   1. exec size > native: split both producer and consumer in half and
    //      interleave them so each half's acc def/use stays adjacent.
    //   2. exec size < native: GRF-align both dsts, since the acc channel
    //      offset is derived from the dst subregister offset.
    // Returns true if any instruction was modified.
    bool changed = false;
    for (auto iter = bb->begin(), iterEnd = bb->end();
        iter != iterEnd; ++iter)
    {
        G4_INST* inst = *iter;

        if (inst->opcode() != G4_addc && inst->opcode() != G4_subb)
        {
            continue;
        }

        // Promote any :uw immediate source (src0 or src1) to :ud, since
        // addc/subb operate on dword types.
        for (int i = 0; i < 2; i++)
        {
            G4_Operand* src = inst->getSrc(i);
            if (src && src->isImm() && src->getType() == Type_UW)
            {
                // just change the immediate's type to :ud
                uint32_t immVal = (uint32_t)src->asImm()->getImm();
                inst->setSrc(builder.createImm(immVal, Type_UD), i);
            }
        }

        if (inst->getExecSize() != builder.getNativeExecSize())
        {
            // find the matching carry move (the instruction that reads acc0
            // produced by this addc/subb)
            G4_INST* carryUse = nullptr;
            auto srchIter = iter;
            for (++srchIter; srchIter != iterEnd; ++srchIter)
            {
                G4_INST* inst2 = *srchIter;
                auto op = inst2->opcode();

                bool opPossibleConsumer =
                    op == G4_mov || op == G4_add || op == G4_addc ||
                    op == G4_mad || op == G4_pseudo_mad || op == G4_add3;

                // only check for a handful of user instructions
                // this list could be extended
                if (opPossibleConsumer &&
                    inst2->getExecSize() == inst->getExecSize() &&
                    inst2->useAcc())
                {
                    carryUse = inst2;
                    break;
                }
                else if (inst2->useAcc())
                {
                    // someone redefines acc0; we can stop looking
                    break;
                }
            }

            if (carryUse == NULL)
            {
                // can't find the move using acc, skip this addc/subb
                assert(false && "unable to find addc/subc consumer");
                continue;
            }

            if (inst->getExecSize() > builder.getNativeExecSize())
            {
                // we're breaking a bigger instruction into a smaller one
                evenlySplitInst(iter, bb);
                evenlySplitInst(srchIter, bb);

                // srchIter now points to the second half of move, and we want to move the first move to be
                // before the second half of the addc/subb, which is pointed by iter
                --srchIter;
                G4_INST* mov1 = *srchIter;
                bb->erase(srchIter);
                bb->insertBefore(iter, mov1);

                changed = true;
            }
            else
            {
                // we will need to GRF-align addc's dst as well as the move dst,
                // so that the acc will have the correct offset
                // note that insertMovAfter will align the tmp since addc/subb has implicit acc use
                if (!builder.isOpndAligned(inst->getDst(), 32))
                {
                    inst->setDest(
                        insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb));
                    changed = true;
                }
                if (!builder.isOpndAligned(carryUse->getDst(), 32))
                {
                    carryUse->setDest(
                        insertMovAfter(srchIter, carryUse->getDst(), carryUse->getDst()->getType(), bb));
                    changed = true;
                }
            }
        }
    }
    return changed;
}
5983
5984 //
5985 // Mixed mode instruction allows bfloat16 operands in the following cases:
5986 // 1. dst, src0, and src1 for 2 source instructions format not involving multiplier(mov, add, cmp, sel).
5987 // 2. dst and src0 for 2 source instructions format involving multiplier(mul, mac etc).
5988 // 3. dst, src0, and src1 for 3 source instructions format(mad).
5989 // 4. Broadcast of bfloat16 scalar is not supported.
5990 // 5. Unpacked bfloat16 destination with stride 2 when register offset is 0 or 1.
5991 // 6. Packed bfloat16 source and destination when register offset is 0 or 8.
5992 // 7. Execution size must not be greater than 8.
5993 // 8. Instructions with pure bfloat16 operands are not supported.
5994 // 6 & 7: register offset would be 0 or 16; execution size is at most 16
5995 //
5996 // **More examples**
5997 // 1. BF imm is not allowed
5998 // mov (1|M0) r12.0<1>:f 0xffff:bf - ILLEGAL "Imm operand with BF type is not allowed"
5999 // 2. BF scalar operand can be used in SIMD1
6000 // mul (1|M0) r14.0<1>:f r11.0<0;1,0>:bf r12.3<0;1,0>:f - OK
6001 // 3. For SIMD1, scalar operands (both dst/src) of F or BF can have any subreg!
6002 // add (1|M0) r16.3<1>:bf r11.0<0;1,0>:f r12.3<0;1,0>:f - OK
6003 // 4. F Operand should have subreg = 0 if execSize > SIMD1
6004 // add (2|M0) r10.4<1>:f r11.0<1;1,0>:bf 0x12345:f
6005 // ILLEGAL "Src0 regioning must be aligned to destination or scalar for Float/64bit pipes"
6006 // 5. Others
6007 // add (8|M0) r16.0<2>:bf r11.0<1;1,0>:f r12.0<1;1,0>:f- OK
6008 // add (8|M0) r16.1<2>:bf r11.0<1;1,0>:f r12.8<1;1,0>:f- OK
6009 // add (8|M0) r16.0<1>:bf r11.0<1;1,0>:f r12.8<1;1,0>:f- OK
6010 // add (8|M0) r16.8<1>:bf r11.0<1;1,0>:f r12.0<1;1,0>:f- OK
6011 // Note that float source operands can be scalar region <0;1,0>
6012 //
void HWConformity::fixBFMixedMode()
{
    // Legalize bfloat16 (BF) mixed-mode usage per the rules in the block
    // comment above this function. Two passes per BB: pass 1 splits
    // too-wide instructions and converts BF operands that are illegal for
    // the opcode; pass 2 legalizes regions/offsets and removes illegal
    // BF broadcasts / pure-BF instructions.

    // Returns true if I's dst (cmp's dst is ignored) or any src has type GivenTy.
    auto useGivenType = [](G4_INST* I, G4_Type GivenTy)
    {
        G4_Operand* dst = I->getDst();
        // ignore cmp's dst (?)
        if (dst && !dst->isNullReg() && !I->isCompare())
        {
            if (dst->getType() == GivenTy)
                return true;
        }
        for (int i = 0; i < I->getNumSrc(); ++i)
        {
            G4_Operand* src = I->getSrc(i);
            if (src && !src->isNullReg())
            {
                if (src->getType() == GivenTy)
                    return true;
            }
        }
        return false;
    };

    // Returns true if opcode of I (or the given operand position of I, when
    // OpndNum is not Opnd_total_num) may legally be BF in mixed mode.
    auto allowBFForInst = [](G4_INST* I, Gen4_Operand_Number OpndNum = Opnd_total_num)
    {
        // Only mul/mac/mad/add/cmp/mov/sel support BF mixed mode.
        switch (I->opcode())
        {
        case G4_mul:
        case G4_mac:
        {
            // src1 (the multiplier operand) must not be BF (case 2).
            if (OpndNum == Opnd_src1)
                return false;
            return true;
        }
        case G4_mad:
        case G4_pseudo_mad:
        {
            // src2 must not be BF (case 3).
            if (OpndNum == Opnd_src2)
                return false;
            return true;
        }
        case G4_add:
        case G4_cmp:
        case G4_mov:
        case G4_sel:
            return true;
        default:
            break;
        }
        return false;
    };

    // Returns true if I needs no BF legalization in this function.
    auto skipBFCheck = [&useGivenType](G4_INST* I)
    {
        // Skip dpas/send
        if (I->isDpas() || I->isSend())
            return true;

        // Do not use BF, skip
        if (!useGivenType(I, Type_BF))
            return true;

        // Special case:
        // 1. mov d:bf s:bf --> mov d:uw s:uw
        // 2. mov d:f  s:bf --> shl d:ud s:uw 16:ud
        if (I->opcode() == G4_mov && I->getSrc(0)->getType() == Type_BF)
        {
            // this will be handled by fixBFMov.
            return true;
        }
        return false;
    };

    if (!kernel.fg.builder->hasBFMixMode())
    {
        return;
    }

    const G4_ExecSize nativeES = kernel.fg.builder->getNativeExecSize();
    for (auto& bb : kernel.fg)
    {
        // First iteration:
        //    1. Legalize scalar BF operand for insts that need splitting
        //       (If this is done in 3, we will have more than 1 scalar mov.)
        //          mul (16|M0)  d<1>:f  s0<1;1,0>:bf  s1<0;1,0>:bf
        //       ==>
        //          (W) mov (1|M0) t<1>:f  s1<0;1,0>:bf
        //              mul (16|M0) d<1>:f  s0<1;1,0>:bf  t<0;1,0>:f
        //    2. split instructions (case 7)
        //          add (16|M0)  d:bf  s0:bf  s1:bf
        //       ==>
        //          add (8|M0)  d:bf     s0:bf    s1:bf
        //          add (8|M8)  d.8:bf   s0.8:bf  s1.8:bf
        //    3. legalize operands by using cvt mov to BF or from BF. (case 1&2&3)
        //          mul (8|M0)  d:bf  s0:bf  s1:bf
        //       ==>
        //          mov (8|M0)  s:f  s1:bf
        //          mul (8|M0)  t:bf s0:bf  s:f
        //       Note pure BF insts will be handled in the second iteration.
        INST_LIST_ITER nextII = bb->begin();
        for (auto II = nextII, IE = bb->end(); II != IE; II = nextII)
        {
            ++nextII;
            G4_INST* Inst = *II;
            if (skipBFCheck(Inst))
                continue;

            const bool isBFAllowedInst = allowBFForInst(Inst);
            const G4_ExecSize currES = Inst->getExecSize();
            std::list<INST_LIST_ITER> instsToSplit;

            // 1. Handle illegal BF scalar by generating mov
            //    First generate mov for scalars instead of splitting first and
            //    then generating mov. Doing so would need just one mov.
            bool changed = false;
            if (currES > nativeES)
            {
                // If inst's execsize <= nativeES, it doesn't need splitting,
                // as its operand takes one GRF at most.
                for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
                {
                    G4_Operand* S = Inst->getSrc(i);
                    Gen4_Operand_Number opndNum = Inst->getSrcOperandNum(i);
                    if (S->getType() == Type_BF && S->isSrcRegRegion())
                    {
                        if (S->asSrcRegRegion()->getRegion()->isScalar()
                            && (!isBFAllowedInst || !allowBFForInst(Inst, opndNum)))
                        {
                            // Promote the illegal scalar BF src to F once, up front.
                            G4_Operand* newSrc = insertMovBefore(II, i, Type_F, bb);
                            Inst->setSrc(newSrc, i);
                            changed = true;
                        }
                    }
                    else if (S->getType() == Type_BF && S->isImm())
                    {
                        // BF immediates are illegal per the HW spec (see comment above).
                        assert(false && "BF immediate not supported!");
                    }
                }
            }

            // If changed, check if it still uses BF. Skip if not.
            if (changed && !useGivenType(Inst, Type_BF))
            {
                continue;
            }

            // 2. Split instruction (case 7) if needed
            //    Now, BF operands are all non-scalar for insts that need splitting.
            //    We split inst under the following:
            //      1. If an inst, which doesn't support BF, has BF operands. Those BF
            //         operands must be replaced with F operands (by inserting mov to
            //         convert BF to F). If replacing a BF operand with a F operand
            //         makes it cross 2 GRF, it must be split (currES * F > 2 GRF); or
            //      2. Split if currES > nativeES for insts that support BF. (case 7)
            std::list<INST_LIST_ITER> instsToCheck;
            if ((!isBFAllowedInst && (TypeSize(Type_F) * currES) > (getGRFSize() * 2))
                || (isBFAllowedInst && currES > nativeES))
            {
                if (currES == g4::SIMD32)
                {
                    splitSIMD32Inst(II, bb);
                    if (isBFAllowedInst && nativeES == g4::SIMD8)
                    {
                        // need to split again: SIMD32 -> 2 x SIMD16 -> 4 x SIMD8.
                        INST_LIST_ITER prev_it = std::prev(II);
                        evenlySplitInst(prev_it, bb);
                        instsToCheck.push_back(std::prev(prev_it));
                        instsToCheck.push_back(prev_it);
                        evenlySplitInst(II, bb);
                    }
                }
                else
                {
                    evenlySplitInst(II, bb);
                }
                instsToCheck.push_back(std::prev(II));
                instsToCheck.push_back(II);
            }
            else
            {
                instsToCheck.push_back(II);
            }

            // 3. Change BF operands, which are not legal, to F by generating mov.
            //    (isBFAllowedInst should be still valid to check if any new instruction
            //     from splitting is BF allowed or not.)
            for (auto LI : instsToCheck)
            {
                INST_LIST_ITER thisII = LI;
                G4_INST* tI = *thisII;
                for (int i = 0, nsrc = (int)tI->getNumSrc(); i < nsrc; ++i)
                {
                    G4_Operand* S = tI->getSrc(i);
                    Gen4_Operand_Number opndNum = tI->getSrcOperandNum(i);
                    if (S->getType() == Type_BF
                        && (!isBFAllowedInst || !allowBFForInst(tI, opndNum)))
                    {
                        G4_Operand* newSrc = insertMovBefore(thisII, i, Type_F, bb);
                        tI->setSrc(newSrc, i);
                    }
                }

                G4_DstRegRegion* Dst = tI->getDst();
                if (!isBFAllowedInst && Dst && !Dst->isNullReg() && Dst->getType() == Type_BF)
                {
                    // BF dst not allowed: write to an F temp and mov it back to BF.
                    G4_DstRegRegion* newDst = insertMovAfter(thisII, Dst, Type_F, bb);
                    tI->setDest(newDst);

                    auto movII = std::next(II);
                    instsToSplit.push_back(movII);
                    G4_INST* movI = *movII;

                    // The inserted mov takes over Inst's uses; Inst now feeds the mov.
                    Inst->transferUse(movI);
                    Inst->addDefUse(movI, Opnd_src0);
                }
            }
            instsToCheck.clear();
        }

        // Second iteration:
        //    Legalize regions by using mov.
        nextII = bb->begin();
        for (auto II = nextII, IE = bb->end(); II != IE; II = nextII)
        {
            ++nextII;
            G4_INST* Inst = *II;
            if (skipBFCheck(Inst))
                continue;

            // Because of the first iteration above, this inst must support bf mixed mode.
            assert(allowBFForInst(Inst));

            const G4_ExecSize currES = Inst->getExecSize();
            bool changed = false;
            // case 4: broadcast of bf is not supported!
            //    As this bf operand is changed to F. At the end of loop, need to check
            //    if this inst still has both BF and F, and "changed" is for this purpose.
            // case 8: pure BF is not allowed.
            for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
            {
                G4_Operand* S = Inst->getSrc(i);
                if (S->getType() == Type_BF)
                {
                    assert(S->isSrcRegRegion());
                    G4_SrcRegRegion* srcReg = S->asSrcRegRegion();
                    if ((srcReg->getRegion()->isScalar() && currES > g4::SIMD1) // broadcast BF scalar
                        || (i == (nsrc - 1) && !useGivenType(Inst, Type_F))) // pure BF.
                    {
                        // Insert bf->f, which is just a left-shift.
                        uint32_t nelts = (uint32_t)(srcReg->getRegion()->isScalar() ? g4::SIMD1 : currES);
                        G4_Declare* newDcl = builder.createTempVar(nelts,
                            Type_UD, (nelts == 1) ? Even_Word : GRFALIGN, "cvtF", false);
                        G4_DstRegRegion* newDst = builder.createDst(newDcl->getRegVar(), Type_UD);
                        // Reinterpret the BF bits as UW so the shl widens them into
                        // the high half of a UD (i.e. a valid F bit pattern).
                        srcReg->setType(Type_UW);
                        G4_INST* shlInst = builder.createBinOp(G4_shl,
                            (nelts== 1) ? g4::SIMD1 : currES,
                            newDst, S, builder.createImm(16, Type_UD), InstOpt_WriteEnable, false);
                        bb->insertBefore(II, shlInst);

                        // srcMod, if present, must be on the promoted F operand!
                        G4_SrcModifier sMod = srcReg->getModifier();
                        srcReg->setModifier(Mod_src_undef);
                        G4_SrcRegRegion* newSrc = builder.createSrc(
                            newDcl->getRegVar(), 0, 0,
                            (nelts == 1) ? builder.getRegionScalar() : builder.getRegionStride1(), Type_F);
                        newSrc->setModifier(sMod);
                        Inst->setSrc(newSrc, i);

                        // Rewire def-use: Inst's def of this operand now belongs to shl.
                        Gen4_Operand_Number opndNum = Inst->getSrcOperandNum(i);
                        Inst->transferDef(shlInst, opndNum, Opnd_src0);
                        shlInst->addDefUse(Inst, opndNum);

                        changed = true;
                    }
                }
            }

            if (changed)
            {
                // Check again if there is still BF type, if not, we are done.
                if (!useGivenType(Inst, Type_BF))
                {
                    continue;
                }
            }

            if (currES == g4::SIMD1)
            {
                // Done: SIMD1 scalar operands may have any subreg (see examples above).
                continue;
            }

            for (int i = 0, nsrc = (int)Inst->getNumSrc(); i < nsrc; ++i)
            {
                G4_Operand* S = Inst->getSrc(i);
                if (S->getType() == Type_F
                    && (S->isImm() || (S->isSrcRegRegion() && S->asSrcRegRegion()->getRegion()->isScalar())))
                {
                    // Scalar/immediate F sources are always legal.
                    continue;
                }

                assert(S->isSrcRegRegion());
                G4_SrcRegRegion* sReg = S->asSrcRegRegion();

                // case 6: Packed bfloat16 source and destination when register offset is 0 or 8.
                //         (also for Float dst/src alignment)
                // Note that for F, enforce it to have subRegOff = 0 (too restrictive?)
                bool isPackedSrc = (sReg->getRegion()->isContiguous(Inst->getExecSize())
                    && (sReg->getSubRegOff() == 0 || (sReg->getType() == Type_BF && sReg->getSubRegOff() == nativeES)));
                if (isPackedSrc)
                {
                    continue;
                }

                // Illegal region/offset: copy through a GRF-aligned temp.
                G4_Operand* newSrc = insertMovBefore(II, i, sReg->getType(), bb, GRFALIGN);
                Inst->setSrc(newSrc, i);
            }

            if (Inst->isCompare())
            {
                // Ignore compare's dst.
                continue;
            }

            G4_DstRegRegion* dst = Inst->getDst();
            uint32_t subOff = dst->getSubRegOff();
            // case 5
            bool isUnpackedDst = (dst->getType() == Type_BF
                && dst->getHorzStride() == 2 && (subOff == 0 || subOff == 1));
            // case 6, note for F, force it to have subOff = 0
            bool isPackedDst = (dst->getHorzStride() == 1
                && (subOff == 0 || (subOff == nativeES && dst->getType() == Type_BF)));
            if (!(isPackedDst || isUnpackedDst))
            {
                // case 5 Unpacked bfloat16 destination with stride 2 when register offset is 0 or 1.
                G4_DstRegRegion* newDst = insertMovAfter(II, dst, dst->getType(), bb, GRFALIGN);
                Inst->setDest(newDst);

                auto movII = std::next(II);
                G4_INST* movI = *movII;

                Inst->transferUse(movI, false);
                Inst->addDefUse(movI, Opnd_src0);
            }
        }
    }
}
6361
chkHWConformity()6362 void HWConformity::chkHWConformity()
6363 {
6364 fixDataLayout();
6365
6366 fixBFMixedMode();
6367
6368 for (auto bb : kernel.fg)
6369 {
6370 curBB = bb;
6371 fixIntToHFMove(bb);
6372 #ifdef _DEBUG
6373 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6374 #endif
6375 fixAddcSubb(bb);
6376 #ifdef _DEBUG
6377 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6378 #endif
6379
6380 fixMADInst(bb);
6381
6382 #ifdef _DEBUG
6383 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6384 #endif
6385 // fix source operand first to avoid redundant MOVs if this fix is done after
6386 // reducing execution size.
6387 // used by 3d. Mainly to fix sel with two imm sources
6388 fixOpndTypeAlign(bb);
6389
6390 #ifdef _DEBUG
6391 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6392 #endif
6393
6394 fixInstExecSize(bb);
6395
6396 #ifdef _DEBUG
6397 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6398 #endif
6399
6400 fixMixedHFInst(bb);
6401
6402 #ifdef _DEBUG
6403 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6404 #endif
6405 fixSADA2Inst(bb);
6406
6407 #ifdef _DEBUG
6408 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6409 #endif
6410
6411 fixSendInst(bb);
6412
6413 if (builder.avoidDstSrcOverlap())
6414 {
6415 fixOverlapInst(bb);
6416 }
6417
6418 if (builder.avoidSrc1Src2Overlap())
6419 {
6420 fixsrc1src2Overlap(bb);
6421 }
6422
6423 #ifdef _DEBUG
6424 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6425 #endif
6426
6427 conformBB(bb);
6428
6429 #ifdef _DEBUG
6430 verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
6431 #endif
6432 }
6433
6434 if (builder.avoidDstSrcOverlap())
6435 {
6436 PointsToAnalysis p(kernel.Declares, kernel.fg.getNumBB());
6437 p.doPointsToAnalysis(kernel.fg);
6438
6439 avoidDstSrcOverlap(p);
6440 }
6441 }
6442
hasBadRegion(G4_INST * inst)6443 bool HWConformity::hasBadRegion(G4_INST* inst)
6444 {
6445 if (inst->getImplAccDst() || inst->getImplAccSrc())
6446 return false;
6447 bool badRegion = false;
6448
6449 #define G4_MAX_ADDR_IMM 511
6450 #define GENX_MAX_H_STRIDE 4
6451 for (unsigned int srcNum = 0, n_srcs = inst->getNumSrc(); srcNum < n_srcs; srcNum++)
6452 {
6453 if (!(inst->getSrc(srcNum)->isSrcRegRegion()))
6454 {
6455 continue;
6456 }
6457 const RegionDesc* rd = inst->getSrc(srcNum)->asSrcRegRegion()->getRegion();
6458 if (rd->isRegionWH())
6459 {
6460 badRegion = true;
6461 break;
6462 }
6463 if (rd->horzStride == GENX_MAX_H_STRIDE && rd->width > 1)
6464 {
6465 badRegion = true;
6466 break;
6467 }
6468 G4_SrcRegRegion* expandSrcRegion = inst->getSrc(srcNum)->asSrcRegRegion();
6469 if (expandSrcRegion->getRegAccess() != Direct)
6470 {
6471 const RegionDesc* origRegion = expandSrcRegion->getRegion();
6472 short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;
6473
6474 if (origRegion->width == 1)
6475 {
6476 secondSubRegOffDiff = origRegion->vertStride;
6477 }
6478 else
6479 {
6480 secondSubRegOffDiff = origRegion->horzStride;
6481 }
6482 secondAddrImmedDiff = (short)(secondSubRegOffDiff * expandSrcRegion->getTypeSize());
6483 if ((expandSrcRegion->getAddrImm() + secondAddrImmedDiff) > G4_MAX_ADDR_IMM)
6484 {
6485 badRegion = true;
6486 break;
6487 }
6488 }
6489 }
6490 return badRegion;
6491 }
6492
canSplitInst(G4_INST * inst,G4_INST * use_op)6493 bool HWConformity::canSplitInst(G4_INST* inst, G4_INST* use_op)
6494 {
6495 if ((inst->getPredicate() && inst->getExecSize() < g4::SIMD16) || hasBadRegion(inst))
6496 return false;
6497
6498 G4_CondMod* condMod = inst->getCondMod();
6499 if (condMod)
6500 {
6501 return false;
6502 }
6503
6504 for (int i = 0; i < inst->getNumSrc(); i++)
6505 {
6506 G4_Operand* src = inst->getSrc(i);
6507 if (src->isAccReg())
6508 {
6509 // don't split inst with explicit acc
6510 return false;
6511 }
6512 if (src->isSrcRegRegion() &&
6513 src->asSrcRegRegion()->getRegion()->vertStride == 32 &&
6514 src->asSrcRegRegion()->getRegion()->width == 1)
6515 {
6516 // don't split the source into even/odd since verstride can't exceed 32
6517 // ToDo: check for horizontal stride as well?
6518 return false;
6519 }
6520 }
6521
6522 return true;
6523 }
6524
canSplitByteDst(G4_opcode op)6525 bool HWConformity::canSplitByteDst(G4_opcode op)
6526 {
6527 switch (op)
6528 {
6529 case G4_mac:
6530 case G4_mach:
6531 case G4_cmp:
6532 case G4_mad:
6533 case G4_sad2:
6534 case G4_sada2:
6535 case G4_line:
6536 case G4_send:
6537 case G4_sendc:
6538 return false;
6539 default:
6540 return true;
6541 }
6542 }
// split one instruction into 2 if its destination is packed byte and execution type is W.
6544 // for example:
6545 // add <16> V1(0,0)<1>:b V1(0,0)<16;16,1>:w V2(0,0)<16;16,1>:w
6546 // ==>
6547 // add <8> V1(0,0)<2>:b V1(0,0)<16;8,2>:w V2(0,0)<16;8,2>:w
6548 // add <8> V1(0,1)<2>:b V1(0,1)<16;8,2>:w V2(0,1)<16;8,2>:w
6549
6550 // if predicate is used for instruction, the definition of this predicate is tracked and the
6551 // corresponding instruction is checked to see if it can do the same split.
bool HWConformity::splitInstListForByteDst(INST_LIST_ITER it, G4_BB* bb, uint16_t extypesize)
{
    // Try to split *it (an inst with a packed byte dst and W execution type)
    // into two half-width halves, per the example in the comment above.
    // If the inst is predicated, the chain of instructions defining its
    // predicate must be splittable the same way; the whole chain is
    // collected first and only split if every member qualifies.
    // Returns true if the split was performed.
    G4_INST* inst = *it;
    G4_opcode inst_op = inst->opcode();
    G4_DstRegRegion* dst = inst->getDst();
    // check if we can split the inst
    if (!canSplitByteDst(inst_op) ||
        inst->getExecSize() == g4::SIMD1 ||
        (!bb->isAllLaneActive() && !inst->isWriteEnableInst()) ||
        dst->getByteOffset() % extypesize != 0 ||
        dst->getHorzStride() != 1 ||
        extypesize != TypeSize(Type_W))
    {
        return false;
    }

    if (inst->getPredicate() || inst->getCondMod())
    {
        return false;
    }

    // recursively the inst that defines its predicate can be split
    INST_LIST expandOpList;
    bool canSplit = canSplitInst(inst, NULL);
    if (canSplit)
    {
        expandOpList.push_back(inst);
    }

    G4_INST* currInst = inst;
    while (canSplit && currInst->getPredicate())
    {
        // look for predicate def inst; it must be the unique Opnd_pred def
        uint16_t defNum = 0;
        G4_INST* defInst = NULL;

        // FIXME: should be currInst->defInstList.begin()?
        for (auto def_iter = inst->def_begin(), end = inst->def_end(); def_iter != end; def_iter++)
        {
            if ((*def_iter).second == Opnd_pred)
            {
                defNum++;
                defInst = (*def_iter).first;
            }
        }
        if (defNum != 1 || !defInst->getCondMod())
        {
            canSplit = false;
            break;
        }
        if (canSplit)
        {
            if (!bb->isAllLaneActive() && !defInst->isWriteEnableInst())
            {
                canSplit = false;
            }
            else
            {
                canSplit = canSplitInst(defInst, currInst);
            }
        }
        // check if def inst can be split
        if (!canSplit)
        {
            break;
        }
        else
        {
            expandOpList.push_back(defInst);
            currInst = defInst;
        }
    }

    // split inst into two
    // Walk backwards from the original position to locate each collected
    // inst, then insert its second half right after it.
    INST_LIST_ITER new_iter = it;
    new_iter++;
    if (canSplit)
    {
        while (!expandOpList.empty())
        {
            G4_INST* expand_op = expandOpList.front();
            expandOpList.pop_front();
            // find location of expand_op in instruction list
            do
            {
                new_iter--;
                if ((*new_iter) == expand_op)
                {
                    break;
                }
            } while (new_iter != bb->begin());

            MUST_BE_TRUE(new_iter != bb->end(), "Cannot find predicate definition function in BB.");
            new_iter++;
            G4_INST* secondHalfOp = splitInstWithByteDst(expand_op);
            MUST_BE_TRUE(secondHalfOp, "Error in spliting instruction.");
            bb->insertBefore(new_iter, secondHalfOp);
        }
    }


    return canSplit;
}
6655
G4_INST* HWConformity::splitInstWithByteDst(G4_INST* expand_op)
{
    // Split expand_op in place into two half-exec-size instructions with
    // doubled strides: expand_op is mutated to cover the even elements and
    // the returned instruction covers the odd elements (see the example
    // above splitInstListForByteDst). The caller inserts the returned
    // second half into the BB.
    G4_ExecSize newExecSize {expand_op->getExecSize() / 2};

    // Predicate/condmod flags are split so each half tests its own lanes.
    if (expand_op->getPredicate())
    {
        expand_op->getPredicate()->splitPred();
    }
    if (expand_op->getCondMod())
    {
        expand_op->getCondMod()->splitCondMod();
    }
    G4_INST* expand_sec_half_op = builder.createInternalInst(
        builder.duplicateOperand(expand_op->getPredicate()),
        expand_op->opcode(),
        builder.duplicateOperand(expand_op->getCondMod()),
        expand_op->getSaturate(),
        newExecSize,
        NULL,
        NULL,
        NULL,
        NULL,
        expand_op->getOption());
    MUST_BE_TRUE(expand_sec_half_op != NULL, ERROR_MEM_ALLOC);

    expand_op->setExecSize(newExecSize);

    if (expand_op->getDst() && !expand_op->hasNULLDst())
    {
        G4_DstRegRegion* old_dst = expand_op->getDst();
        short secondSubRegOff = old_dst->getSubRegOff() + 1;

        G4_DstRegRegion* newDstOpnd = nullptr;

        // First half: same offset, doubled horizontal stride.
        if (!old_dst->isIndirect())
        {
            newDstOpnd = builder.createDst(
                old_dst->getBase(),
                old_dst->getRegOff(),
                old_dst->getSubRegOff(),
                old_dst->getHorzStride() * 2,
                old_dst->getType());
        }
        else
        {
            newDstOpnd = builder.createIndirectDst(
                old_dst->getBase(),
                old_dst->getSubRegOff(),
                old_dst->getHorzStride() * 2,
                old_dst->getType(),
                old_dst->getAddrImm());
            // For indirect dst the odd elements are addressed via the
            // address immediate (+1 below), not via the subreg offset.
            secondSubRegOff -= 1;
        }

        expand_op->setDest(newDstOpnd);

        G4_DstRegRegion* secondDstOpnd = nullptr;

        // Second half: starts one element later, same doubled stride.
        if (!old_dst->isIndirect())
        {
            secondDstOpnd = builder.createDst(
                old_dst->getBase(),
                old_dst->getRegOff(),
                secondSubRegOff,
                old_dst->getHorzStride() * 2,
                old_dst->getType());
        }
        else
        {
            secondDstOpnd = builder.createIndirectDst(
                old_dst->getBase(),
                secondSubRegOff,
                old_dst->getHorzStride() * 2,
                old_dst->getType(),
                old_dst->getAddrImm() + 1);
        }

        expand_sec_half_op->setDest(secondDstOpnd);
    }
    else
    {
        expand_sec_half_op->setDest(expand_op->getDst());
    }

    for (int k = 0, n_srcs = expand_op->getNumSrc(); k < n_srcs; k++)
    {
        G4_Operand* expand_src = expand_op->getSrc(k);

        if (!expand_src)
            continue;

        // Immediates (and math's null src1) are shared by both halves as-is.
        if ((expand_op->isMath() && k == 1 && expand_src->isNullReg()) ||
            expand_src->isImm()) {
            expand_sec_half_op->setSrc(expand_src, k);
        }
        else if (expand_src->isSrcRegRegion()) {
            G4_SrcRegRegion* expandSrcRegion = expand_src->asSrcRegRegion();

            if (expandSrcRegion->isScalar()) {
                expand_sec_half_op->setSrc(builder.duplicateOperand(expand_src), k);
            }
            else {
                short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;

                const RegionDesc* origRegion = expandSrcRegion->getRegion();
                const RegionDesc* newRegion = NULL;

                // Build the even/odd interleaved region: width-1 regions
                // double the vertical stride; wider regions halve the width
                // and double the horizontal stride.
                if (origRegion->width == 1)
                {
                    newRegion = builder.createRegionDesc(origRegion->vertStride * 2, origRegion->width, origRegion->horzStride);
                    secondSubRegOffDiff = origRegion->vertStride;
                }
                else
                {
                    unsigned short newWD = origRegion->width / 2;
                    secondSubRegOffDiff = origRegion->horzStride;
                    newRegion = builder.createRegionDesc(
                        (newWD == 1 && newExecSize == 1) ? 0 : origRegion->vertStride,
                        newWD, (newWD == 1) ? 0 : origRegion->horzStride * 2);
                }
                secondAddrImmedDiff = (short)(secondSubRegOffDiff * expand_src->getTypeSize());
                expandSrcRegion->setRegion(newRegion);

                bool directSrc = (expandSrcRegion->getRegAccess() == Direct);
                // If the second half's offset crosses a GRF, bump the register
                // offset instead and rebase the subreg offset.
                if (secondAddrImmedDiff >= (int)numEltPerGRF<Type_UB>())
                {
                    secondSubRegOffDiff =
                        (short)((secondAddrImmedDiff - numEltPerGRF<Type_UB>()) / expand_src->getTypeSize());
                }
                G4_SrcRegRegion* secondSrcOpnd = builder.createSrcRegRegion(
                    expandSrcRegion->getModifier(),
                    expandSrcRegion->getRegAccess(),
                    expandSrcRegion->getBase(),
                    expandSrcRegion->getRegOff() + ((directSrc && secondAddrImmedDiff >= (int)numEltPerGRF<Type_UB>()) ? 1 : 0),
                    expandSrcRegion->getSubRegOff() + (directSrc ? secondSubRegOffDiff : 0),
                    newRegion,
                    expandSrcRegion->getType());
                if (expandSrcRegion->getRegAccess() != Direct)
                {
                    secondSrcOpnd->setImmAddrOff(expandSrcRegion->getAddrImm() + secondAddrImmedDiff);
                }
                expand_sec_half_op->setSrc(secondSrcOpnd, k);
            }
        }
    }
    expand_sec_half_op->inheritDIFrom(expand_op);

    // Set the quarter-control of the second half so it executes the upper
    // lanes when flags are involved.
    if (expand_op->getPredicate() || expand_op->getCondMod())
    {
        if (expand_op->getMaskOffset() == 0)
        {
            expand_sec_half_op->setMaskOption(InstOpt_M8);
        }
        else if (expand_op->getMaskOffset() == 16)
        {
            expand_sec_half_op->setMaskOption(InstOpt_M24);
        }
        else if (!(expand_op->opcode() == G4_sel && !(expand_op->getPredicate()) && expand_op->getCondMod()))
        {
            expand_sec_half_op->setMaskOption(newExecSize > 8 ? InstOpt_M16 : InstOpt_M8);
        }
    }
    return expand_sec_half_op;
}
6820
6821 // in addition, fix the source region to follow the region restriction:
6822 // 1. ExecSize must be greater than or equal to Width. -- no check for this one
// 2. If ExecSize = Width and HorzStride != 0, VertStride must be set to Width * HorzStride.
6824 // 3. If ExecSize = Width and HorzStride = 0, there is no restriction on VertStride.
6825 // 4. If Width = 1, HorzStride must be 0 regardless of the values of ExecSize and VertStride.
6826 // 5. If ExecSize = Width = 1, both VertStride and HorzStride must be 0. This defines a scalar.
6827 // 6. If VertStride = HorzStride = 0, Width must be 1 regardless of the value of ExecSize.
6828 // 7. Dst.HorzStride must not be 0. -- this needs not to be checked.
6829 // 8. VertStride must be used to cross GRF register boundaries. This rule implies that
6830 // elements within a 'Width' cannot cross GRF boundaries.
fixSrcRegion(G4_INST * inst)6831 void HWConformity::fixSrcRegion(G4_INST* inst)
6832 {
6833 bool comprInst = isCompressedInst(inst);
6834 for (int i = 0; i < G4_MAX_SRCS; i++)
6835 {
6836 if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() && !inst->getSrc(i)->isNullReg())
6837 {
6838 G4_SrcRegRegion* src = inst->getSrc(i)->asSrcRegRegion();
6839 const RegionDesc* srcRegion = src->getRegion();
6840 if (srcRegion->isRegionWH() || srcRegion->isRegionV() || srcRegion->isRegionSW())
6841 {
6842 // normalize VxH regions if possible
6843 if (srcRegion->isRegionWH() && srcRegion->width == inst->getExecSize())
6844 {
6845 // r[a0.0]<E, S> -> r[a0.0]<S;1,0>
6846 src->setRegion(builder.createRegionDesc(srcRegion->horzStride, 1, 0));
6847 }
6848 // ToDo: add other legalization
6849 continue;
6850 }
6851
6852 //ToDo: most of these checks should be obsolete at this point
6853 uint16_t vs = srcRegion->vertStride, wd = srcRegion->width, hs = srcRegion->horzStride;
6854 uint8_t exSize = inst->getExecSize();
6855 MUST_BE_TRUE(inst->isSend() || exSize >= wd, " Bad source region: Width is greater than execution size.");
6856 if (comprInst)
6857 {
6858 if (inst->getSrc(i)->getTypeSize() > G4_WSIZE &&
6859 wd == exSize &&
6860 vs == wd && hs == 1)
6861 {
6862 vs = wd = exSize / 2;
6863 }
6864 }
6865 if (wd == exSize && hs != 0 && vs != wd * hs)
6866 {
6867 // <V;E,H> --> <V*H;E,H>
6868 vs = wd * hs;
6869 }
6870 if (wd == 1)
6871 {
6872 // <V;1,H> -> <V;1,0> or <0;1,0>
6873 hs = 0;
6874 if (1 == exSize)
6875 vs = 0;
6876 }
6877 if (vs == 0 && hs == 0)
6878 {
6879 // <0;N,0> -> <0;1,0>
6880 wd = 1;
6881 }
6882 if (hs == 0 &&
6883 ((inst->getSrc(i)->getTypeSize() == G4_WSIZE &&
6884 exSize == 32 && vs == 32 && wd == 32) ||
6885 (inst->getSrc(i)->getTypeSize() == G4_DSIZE &&
6886 exSize == 16 && vs == 16 && wd == 16)))
6887 {
6888 vs = 0;
6889 wd = 1;
6890 }
6891
6892 // check cross GRF (rule 2H)
6893 // TODO! for the following two cases, split the instruction:
6894 // source region is like <8;4,1>
6895 // source region is like <2;4,1>
6896 if (src->getRegAccess() == Direct && src->crossGRF() && hs != 0)
6897 {
6898 // TODO: this is a temp fix
6899 if ((builder.getPlatform() == GENX_BDW || builder.getPlatform() == GENX_CHV) && vs < wd * hs)
6900 continue;
6901 // check number of elements in first GRF.
6902 uint16_t execTypeSize = hs * src->getElemSize();
6903 uint16_t sizeInFirstGRF = numEltPerGRF<Type_UB>() - src->getLeftBound() % numEltPerGRF<Type_UB>();
6904 uint16_t vertSize = vs * src->getTypeSize();
6905 uint16_t numEle = (sizeInFirstGRF + execTypeSize - 1) / execTypeSize;
6906 uint16_t rowSize = wd * execTypeSize;
6907
6908 if (sizeInFirstGRF <= vertSize)
6909 {
6910 if (numEle >= wd)
6911 {
6912 numEle = wd;
6913 }
6914 }
6915 else if (vs > wd)
6916 {
6917 numEle = sizeInFirstGRF / vertSize * wd +
6918 ((sizeInFirstGRF % vertSize > rowSize) ? wd : (sizeInFirstGRF % vertSize + execTypeSize - 1) / execTypeSize);
6919 }
6920 // wd is used to cross GRF, change to <vs;1,0>
6921 if (numEle < wd || (wd >= vs && numEle % wd != 0))
6922 {
6923
6924 wd = 1;
6925 if (hs == 0)
6926 {
6927 vs = 1;
6928 }
6929 else
6930 {
6931 vs = hs;
6932 }
6933 hs = 0;
6934 }
6935 }
6936
6937 if (vs != srcRegion->vertStride || wd != srcRegion->width || hs != srcRegion->horzStride)
6938 {
6939 G4_SrcRegRegion* origSrc = inst->getSrc(i)->asSrcRegRegion();
6940 origSrc->setRegion(builder.createRegionDesc(vs, wd, hs));
6941 }
6942 }
6943 }
6944 if (inst->getDst() && !inst->hasNULLDst())
6945 {
6946 MUST_BE_TRUE(inst->getDst()->getHorzStride() != 0,
6947 "Bad source region: Width is greater than execution size.");
6948 }
6949 }
6950
6951 //
6952 //single entry point for HW conformity checks
6953 //
HWConformityChk(IR_Builder & builder,G4_Kernel & kernel,Mem_Manager & mem)6954 void HWConformityChk(IR_Builder& builder, G4_Kernel& kernel, Mem_Manager& mem)
6955 {
6956 HWConformity conformity(builder, kernel, mem);
6957 conformity.chkHWConformity();
6958 }
6959
// Classify the access pattern of the root declare behind 'opnd'.
// A declare becomes an ACCESS_PATTERN_PACKED_BYTE candidate when it is a
// local (non-address-taken, no pre-assigned physical register, non-global)
// byte GRF variable whose uses fit the packed-byte pattern checked below;
// any disqualifying use marks it ACCESS_PATTERN_INVALID instead, which
// prevents fixDataLayout from widening it.
// Returns true iff this operand newly marked its declare as a candidate.
bool HWConformity::markPackedByteReference(G4_Kernel& kernel, G4_Operand* opnd, G4_INST* inst)
{
    G4_Declare* dcl = NULL, * topdcl = NULL;
    bool foundOptCandidate = false;

    // Only register operands participate; resolve the root (alias-free) declare.
    if ((opnd->isSrcRegRegion() || opnd->isDstRegRegion()))
    {
        if (opnd->getBase() && opnd->getBase()->isRegVar())
        {
            dcl = opnd->getBase()->asRegVar()->getDeclare();
            topdcl = dcl->getRootDeclare();
        }
    }

    if (topdcl != NULL &&
        topdcl->getRegFile() == G4_GRF &&
        !(topdcl->getAddressed()))
    {
        if (topdcl->doNotWiden() || inst->mayExceedTwoGRF())
        {
            //send has no regioning so it is certainly illegal to change data layout
            setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
            return false;
        }

        if (opnd->isDstRegRegion() &&
            // check if the opnd has pre-assigned physical register
            !(topdcl->getRegVar()->isPhyRegAssigned()) &&
            // check if the opnd is global
            !(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
            // check if the opnd is used as packed byte
            opnd->getTypeSize() == 1 &&
            !hasDedicateAlignRegionConformity(inst) &&
            dcl->getElemSize() == 1 &&
            opnd->asDstRegRegion()->getHorzStride() == 1 &&
            // check if the instruction is a raw mov
            !inst->isRawMov() &&
            // check if the instruction execution type is word
            // (This should be the most common case that can benefit
            // from this optimization. It could be extended to other
            // cases like D execution type).
            TypeSize(inst->getExecType()) == 2)
        {
            unsigned int leftBound = opnd->asDstRegRegion()->getLeftBound();
            unsigned int rightBound = opnd->asDstRegRegion()->getRightBound();

            // After widening (bounds doubled), the operand must stay within
            // two GRFs (and within one GRF on BDW); otherwise disqualify.
            if (((rightBound * 2 / numEltPerGRF<Type_UB>() - leftBound * 2 / numEltPerGRF<Type_UB>()) > 1) ||
                (builder.getPlatform() == GENX_BDW &&
                    (rightBound * 2 / numEltPerGRF<Type_UB>() != leftBound * 2 / numEltPerGRF<Type_UB>())))
            {
                setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
            }
            else if (getAccessPattern(topdcl) == ACCESS_PATTERN_UNDEF)
            {
                // First qualifying dst use: mark the declare as a candidate.
                setAccessPattern(topdcl, ACCESS_PATTERN_PACKED_BYTE);
                foundOptCandidate = true;
            }
        }
        else if (opnd->isSrcRegRegion() &&
            // check if the opnd has pre-assigned physical register
            !(opnd->asSrcRegRegion()->getBase()->asRegVar()->isPhyRegAssigned()) &&
            // check if the opnd is global
            !(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
            // check if the opnd is used as packed byte
            opnd->getTypeSize() == 1 &&
            dcl->getElemSize() == 1 &&
            opnd->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()))
        {
            unsigned int leftBound = opnd->asSrcRegRegion()->getLeftBound();
            unsigned int rightBound = opnd->asSrcRegRegion()->getRightBound();

            // Same GRF-span restriction as the dst case; a bad src use only
            // invalidates, it never creates a candidate.
            if (((rightBound * 2 / numEltPerGRF<Type_UB>() - leftBound * 2 / numEltPerGRF<Type_UB>()) > 1) ||
                (builder.getPlatform() == GENX_BDW &&
                    (rightBound * 2 / numEltPerGRF<Type_UB>() != leftBound * 2 / numEltPerGRF<Type_UB>())))
            {
                setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
            }
        }
        else
        {
            // Any other kind of use disqualifies the declare entirely.
            setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
        }
    }

    return foundOptCandidate;
}
7046
fixPackedByteReference(IR_Builder & builder,G4_Operand * opnd)7047 G4_Operand* HWConformity::fixPackedByteReference(IR_Builder& builder, G4_Operand* opnd)
7048 {
7049 G4_Operand* newOpnd = NULL;
7050 G4_Declare* topdcl = NULL;
7051
7052 if (opnd->isDstRegRegion() ||
7053 opnd->isSrcRegRegion())
7054 {
7055 topdcl = GetTopDclFromRegRegion(opnd);
7056 }
7057
7058 if (topdcl != NULL &&
7059 getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
7060 {
7061 if (opnd->isDstRegRegion())
7062 {
7063 short dst_regoff = opnd->asDstRegRegion()->getRegOff();
7064 short dst_subregoff = opnd->asDstRegRegion()->getSubRegOff();
7065 short off = (dst_regoff * numEltPerGRF<Type_UB>() + dst_subregoff) * 2;
7066
7067 dst_regoff = off / numEltPerGRF<Type_UB>();
7068 dst_subregoff = off % numEltPerGRF<Type_UB>();
7069
7070 G4_DstRegRegion* newDstOpnd = builder.createDst(
7071 opnd->getBase()->asRegVar(),
7072 dst_regoff,
7073 dst_subregoff,
7074 2,
7075 opnd->getType());
7076 newOpnd = newDstOpnd;
7077 }
7078 else if (opnd->isSrcRegRegion())
7079 {
7080 short src_regoff = opnd->asSrcRegRegion()->getRegOff();
7081 short src_subregoff = opnd->asSrcRegRegion()->getSubRegOff();
7082 short off = (src_regoff * numEltPerGRF<Type_UB>() + src_subregoff) * 2;
7083
7084 src_regoff = off / numEltPerGRF<Type_UB>();
7085 src_subregoff = off % numEltPerGRF<Type_UB>();
7086
7087 const RegionDesc* rd = builder.getRegionStride2();
7088 G4_SrcRegRegion* newSrcOpnd = builder.createSrcRegRegion(opnd->asSrcRegRegion()->getModifier(),
7089 Direct,
7090 opnd->getBase()->asRegVar(),
7091 src_regoff,
7092 src_subregoff,
7093 rd,
7094 opnd->getType());
7095 newOpnd = newSrcOpnd;
7096 }
7097 }
7098
7099 return newOpnd;
7100 }
7101
fixDataLayout()7102 void HWConformity::fixDataLayout()
7103 {
7104 bool changeDataLayout = false;
7105
7106 for (auto& bb : kernel.fg)
7107 {
7108 for (auto& inst : *bb)
7109 {
7110 if (G4_Inst_Table[inst->opcode()].n_dst == 1)
7111 {
7112 G4_Operand* dst = inst->getDst();
7113
7114 if (dst)
7115 {
7116 bool foundOptCandidate = markPackedByteReference(kernel, dst, inst);
7117 if (changeDataLayout == false && foundOptCandidate)
7118 {
7119 changeDataLayout = true;
7120 }
7121 }
7122 }
7123
7124 for (int i = 0; i < inst->getNumSrc(); i++)
7125 {
7126 G4_Operand* src = inst->getSrc(i);
7127
7128 if (src)
7129 {
7130 markPackedByteReference(kernel, src, inst);
7131 }
7132 }
7133 }
7134 }
7135
7136 if (changeDataLayout)
7137 {
7138 for (auto& dcl : kernel.Declares)
7139 {
7140 G4_Declare* topdcl = dcl->getRootDeclare();
7141
7142 if (getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
7143 {
7144 dcl->setTotalElems(dcl->getTotalElems() * 2);
7145
7146 if (dcl != topdcl)
7147 {
7148 G4_Declare* aliasDcl = dcl->getAliasDeclare();
7149 unsigned int aliasOffset = dcl->getAliasOffset();
7150 dcl->setAliasDeclare(aliasDcl, aliasOffset * 2);
7151 }
7152 }
7153 }
7154
7155 for (auto& bb : kernel.fg)
7156 {
7157 for (auto& inst : *bb)
7158 {
7159 if (G4_Inst_Table[inst->opcode()].n_dst == 1)
7160 {
7161 G4_Operand* dst = inst->getDst();
7162 G4_Operand* newDst = NULL;
7163
7164 if (dst)
7165 {
7166 newDst = fixPackedByteReference(builder, dst);
7167 if (newDst)
7168 {
7169 inst->setDest(newDst->asDstRegRegion());
7170 }
7171 }
7172 }
7173
7174 for (int i = 0; i < inst->getNumSrc(); i++)
7175 {
7176 G4_Operand* src = inst->getSrc(i);
7177 G4_Operand* newSrc = NULL;
7178
7179 if (src)
7180 {
7181 newSrc = fixPackedByteReference(builder, src);
7182 if (newSrc)
7183 {
7184 inst->setSrc(newSrc, i);
7185 }
7186 }
7187 }
7188 }
7189 }
7190 }
7191 }
7192
7193 // maintain def-use chain for current inst and the MOV inst generated for its dst
maintainDU4TempMov(G4_INST * inst,G4_INST * newInst)7194 void HWConformity::maintainDU4TempMov(G4_INST* inst, G4_INST* newInst)
7195 {
7196 if (newInst->getPredicate())
7197 {
7198 inst->transferDef(newInst, Opnd_pred, Opnd_pred);
7199 }
7200
7201 inst->transferUse(newInst);
7202
7203 inst->addDefUse(newInst, Opnd_src0);
7204 }
7205
// Emit one SIMD8 step of the pln macro expansion before 'it':
//   mad (8) acc0  src0.3  u  src0.0   {NoMask}
//   mad (8) dst   acc0    v  src0.1   (original pred/condMod/saturate)
// where u/v come from src1 (offsets shifted when 'secondHalf' expands the
// upper half of a SIMD16 pln). On platforms without the NF accumulator
// type, a temp F variable stands in for acc0.
static void expandPlaneMacro(IR_Builder& builder, INST_LIST_ITER it, G4_BB* bb, bool secondHalf)
{
    G4_INST* inst = *it;
    G4_DstRegRegion* dst = inst->getDst();
    G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
    G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();

    // Plane coefficients as scalar reads: P = src0.0, Q = src0.1, R = src0.3.
    G4_SrcRegRegion* srcP = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
        src0->getRegOff(), src0->getSubRegOff(), builder.getRegionScalar(), src0->getType());
    G4_SrcRegRegion* srcQ = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
        src0->getRegOff(), src0->getSubRegOff() + 1, builder.getRegionScalar(), src0->getType());
    G4_SrcRegRegion* srcR = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
        src0->getRegOff(), src0->getSubRegOff() + 3, builder.getRegionScalar(), src0->getType());

    // u/v operands from src1: by default one GRF each, second half two GRFs up.
    auto u = builder.createSrcWithNewRegOff(src1, src1->getRegOff() + (secondHalf ? 2 : 0));
    auto v = builder.createSrcWithNewRegOff(src1, src1->getRegOff() + (secondHalf ? 3 : 1));
    if (getGRFSize() == 64)
    {
        // With 64-byte GRFs both halves of u/v fit in one register; address
        // v at subregister +8 instead of the next register.
        u = builder.createSrcRegRegion(src1->getModifier(), Direct, src1->getBase(),
            src1->getRegOff() + (secondHalf ? 1 : 0), src1->getSubRegOff(), src1->getRegion(), src1->getType(), src1->getAccRegSel());
        v = builder.createSrcRegRegion(src1->getModifier(), Direct, src1->getBase(),
            src1->getRegOff() + (secondHalf ? 1 : 0), src1->getSubRegOff() + 8, src1->getRegion(), src1->getType(), src1->getAccRegSel());
    }

    // For a SIMD16 pln, each SIMD8 half must carry the matching quarter
    // control; recompute the mask offset for the half being expanded.
    uint32_t options = inst->getOption();
    if (inst->getExecSize() == g4::SIMD16)
    {
        options &= ~InstOpt_QuarterMasks;
        int maskOffset = inst->getMaskOffset() + (secondHalf ? 8 : 0);
        switch (maskOffset)
        {
        case 0:
            options |= InstOpt_M0;
            break;
        case 8:
            options |= InstOpt_M8;
            break;
        case 16:
            options |= InstOpt_M16;
            break;
        case 24:
            options |= InstOpt_M24;
            break;
        default:
            MUST_BE_TRUE(false, "unexpected offset value");
        }
    }

    // First mad accumulates into acc0:nf when available, else a temp F var.
    G4_Declare* tmpVal = builder.hasNFType() ? nullptr : builder.createTempVar(8, Type_F, Any);
    G4_DstRegRegion* accDst = builder.hasNFType() ?
        builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, Type_NF) :
        builder.createDstRegRegion(tmpVal, 1);
    G4_INST* madInst = builder.createInternalInst(
        nullptr, G4_mad, nullptr, g4::NOSAT, g4::SIMD8,
        accDst, srcR, u, srcP,
        options | InstOpt_WriteEnable);
    bb->insertBefore(it, madInst);

    // Second mad consumes the accumulator and writes the real dst,
    // inheriting the original predicate/condMod/saturate.
    G4_Predicate* pred = inst->getPredicate() ? builder.duplicateOperand(inst->getPredicate()) : nullptr;
    G4_CondMod* condMod = inst->getCondMod() ? builder.duplicateOperand(inst->getCondMod()) : nullptr;
    G4_SrcRegRegion* accSrc = builder.hasNFType() ?
        builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0, builder.getRegionStride1(), Type_NF) :
        builder.createSrcRegRegion(tmpVal, builder.getRegionStride1());
    G4_DstRegRegion* newDst = builder.createDst(dst->getBase(),
        dst->getRegOff() + (secondHalf ? 1 : 0), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
    if (getGRFSize() == 64)
    {
        // 64-byte GRF: second half lands at subregister +8 of the same GRF.
        newDst = builder.createDst(dst->getBase(),
            dst->getRegOff(), dst->getSubRegOff() + (secondHalf ? 8 : 0), dst->getHorzStride(), dst->getType());
    }
    G4_INST* secondMadInst = builder.createInternalInst(
        pred, G4_mad, condMod, inst->getSaturate(), g4::SIMD8,
        newDst, accSrc, v, srcQ, options);
    bb->insertBefore(it, secondMadInst);
}
7281
7282 // Replace plane with a macro sequence:
7283 // pln dest:f src0:f src1:f
7284 // -->
7285 // mad acc0:nf src0.3:f src1:f src0.0:f
7286 // mad dest:f acc0:nf src1+1:f src0.1:f
7287 // simd16 pln also needs to be split as the macro is simd8 only
7288
// Replace a pln instruction with the two-mad macro sequence:
//   pln dest:f src0:f src1:f
//   -->
//   mad acc0:nf src0.3:f src1:f src0.0:f
//   mad dest:f  acc0:nf  src1+1:f src0.1:f
// A SIMD16 pln is split into two SIMD8 expansions since the macro is
// SIMD8-only. The original pln is erased afterwards.
void HWConformity::expandPlaneInst(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* inst = *it;
    MUST_BE_TRUE(inst->opcode() == G4_pln, "expect a plane inst");
    MUST_BE_TRUE(inst->getSrc(0)->isSrcRegRegion(), "src0 must be source reg region");

    // The macro needs a direct, packed dst; legalize with a trailing mov.
    G4_DstRegRegion* dst = inst->getDst();
    if (dst->getRegAccess() == IndirGRF || dst->getHorzStride() > 1)
    {
        inst->setDest(insertMovAfter(it, dst, dst->getType(), bb));
    }
    G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
    if (src0->getRegAccess() == IndirGRF)
    {
        // insert move to make src0 direct
        inst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
    }
    G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
    if (src1->getRegAccess() == IndirGRF)
    {
        // insert move to make src1 direct
        inst->setSrc(insertMovBefore(it, 1, src1->getType(), bb), 1);
    }

    // Expand the first (or only) SIMD8 half, then the upper half for SIMD16.
    expandPlaneMacro(builder, it, bb, false);
    if (inst->getExecSize() == g4::SIMD16)
    {
        expandPlaneMacro(builder, it, bb, true);
    }

    it = bb->erase(it);
}
7321
7322 // plane does not support pln with non-packed dst.
7323 // also fix up plane sources, which don't support modifiers
7324 // returns true if the original plane is deleted
// Legalize a pln instruction:
//  - platforms without native pln get the full mad-macro expansion
//    (returns true because the original pln is deleted);
//  - otherwise enforce packed dst, a 16-byte-aligned modifier-free src0,
//    and a modifier-free src1 (copying to temps where needed).
bool HWConformity::fixPlaneInst(INST_LIST_ITER it, G4_BB* bb)
{

    G4_INST* inst = *it;
    if (inst->opcode() == G4_pln)
    {
        if (!builder.doPlane())
        {
            // No HW pln support: expand to mads; the pln itself is erased.
            expandPlaneInst(it, bb);
            return true;
        }
        G4_DstRegRegion* dst = inst->getDst();
        if (dst->getHorzStride() != 1)
        {
            // pln requires a packed dst; write to a temp then copy out.
            G4_DstRegRegion* newDst = insertMovAfter(it, dst, dst->getType(), bb);
            inst->setDest(newDst);
        }

        G4_Operand* src0 = inst->getSrc(0);
        G4_Operand* src1 = inst->getSrc(1);

        // Source modifiers are not supported for pln instruction;
        // src0 must also be oword (16-byte) aligned.
        if (src0 &&
            ((src0->isSrcRegRegion() &&
                src0->asSrcRegRegion()->getModifier() != Mod_src_undef) ||
                !builder.isOpndAligned(src0, 16)))
        {
            // src0 needs a temp
            G4_Declare* tmpDcl = builder.createTempVar(4, Type_F,
                GRFALIGN);

            // Before:
            // pln (16) dst, (mod)src0, src1
            //
            // After:
            // mov (4) tmp(0,0):f (mod)src0(r)<4;4,1>:f
            // pln (16) dst, tmp(0,0)<0;1,0>, src1
            G4_DstRegRegion* dstRgn = builder.createDst(
                tmpDcl->getRegVar(),
                0,
                0,
                1,
                Type_F);

            const RegionDesc* rd = builder.createRegionDesc(4, 4, 1);
            G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
                src0->asSrcRegRegion()->getModifier(),
                Direct,
                src0->asSrcRegRegion()->getBase(),
                src0->asSrcRegRegion()->getRegOff(),
                src0->asSrcRegRegion()->getSubRegOff(),
                rd,
                Type_F);

            // The copy resolves the modifier; pln then reads the clean temp.
            G4_INST* newInst = builder.createMov(g4::SIMD4, dstRgn, srcRgn, InstOpt_NoOpt, false);

            bb->insertBefore(it, newInst);

            rd = builder.getRegionScalar();
            G4_SrcRegRegion* newSrcRgn = builder.createSrc(
                tmpDcl->getRegVar(),
                0,
                0,
                rd,
                Type_F);

            inst->setSrc(newSrcRgn, 0);
            // Rewire def-use: src0's old defs feed the copy; the copy defines
            // pln's src0.
            inst->transferDef(newInst, Opnd_src0, Opnd_src0);
            newInst->addDefUse(inst, Opnd_src0);
        }

        if (src1 && src1->isSrcRegRegion() && src1->asSrcRegRegion()->getModifier() != Mod_src_undef)
        {
            // src1 needs a temp
            // For pln instruction src2 is implied from src1 and exec_size
            // When exec_size = 8, src2 is 1 GRF after src1 with size = 1 GRF
            // When exec_size = 16, src2 is 2 GRFs after src1 with size = 2 GRFs
            unsigned short numGRFsToCopy = inst->getExecSize() == g4::SIMD8 ? 2 : 4;

            G4_Declare* tmpDcl = builder.createTempVar((unsigned short)(numEltPerGRF<Type_UB>() / TypeSize(Type_F) * numGRFsToCopy), Type_F,
                Any);

            // Before:
            // pln (16) dst, src0, (mod)src1
            //
            // After:
            // mov (16) tmp(0,0):f (mod)src1(r)<8;8,1>:f
            // mov (16) tmp(2,0):f (mod)src1(r+2)<8;8,1>:f <-- only if exec_size = 16
            // pln (16) dst, src0, tmp(0,0)
            for (int i = 0; i < numGRFsToCopy; i += 2)
            {
                G4_DstRegRegion* dstRgn = builder.createDst(
                    tmpDcl->getRegVar(),
                    (short)i,
                    0,
                    1,
                    Type_F);

                const RegionDesc* rd = builder.createRegionDesc(8, 8, 1);
                G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
                    src1->asSrcRegRegion()->getModifier(),
                    Direct,
                    src1->asSrcRegRegion()->getBase(),
                    src1->asSrcRegRegion()->getRegOff() + i,
                    0,
                    rd,
                    Type_F);

                G4_INST* newInst = builder.createMov(g4::SIMD16, dstRgn, srcRgn, InstOpt_NoOpt, false);

                bb->insertBefore(it, newInst);

                // Only the first copy replaces pln's src1 operand; all copies
                // become defs of it.
                if (i == 0)
                {
                    G4_SrcRegRegion* newSrcRgn = builder.createSrc(
                        tmpDcl->getRegVar(),
                        0,
                        0,
                        rd,
                        Type_F);

                    inst->setSrc(newSrcRgn, 1);
                    inst->transferDef(newInst, Opnd_src1, Opnd_src0);
                }
                newInst->addDefUse(inst, Opnd_src1);
            }
        }
    }
    return false;
}
7455
fixImm64(INST_LIST_ITER i,G4_BB * bb)7456 void HWConformity::fixImm64(INST_LIST_ITER i,
7457 G4_BB* bb)
7458 {
7459 G4_INST* inst = *i;
7460 for (int j = 0, n_srcs = inst->getNumSrc(); j < n_srcs; j++)
7461 {
7462 G4_Operand* src = inst->getSrc(j);
7463 if (!src ||
7464 !(src->isImm()) ||
7465 src->getTypeSize() != 8)
7466 {
7467 continue;
7468 }
7469 // a 64bit immediate is supported ONLY for a MOV operation
7470 bool needsSplit = false;
7471
7472 if (VISA_WA_CHECK(builder.getPWaTable(), WaDisallow64BitImmMov))
7473 {
7474 needsSplit = true;
7475 }
7476 if (needsSplit)
7477 {
7478 char* immPtr = NULL;
7479 double dfValue = 0.0f;
7480 int64_t qValue = 0;
7481
7482 if (IS_DFTYPE(src->getType()))
7483 {
7484 dfValue = src->asImm()->getDouble();
7485 immPtr = (char*)&dfValue;
7486 }
7487 else
7488 {
7489 qValue = src->asImm()->getInt();
7490 immPtr = (char*)&qValue;
7491 }
7492 unsigned int lowValue = *((unsigned int*)(immPtr));
7493 unsigned int highValue = *((unsigned int*)(immPtr + 4));
7494 G4_Imm* lowImm = builder.createImm((int64_t)lowValue, Type_UD);
7495 G4_Imm* highImm = builder.createImm((int64_t)highValue, Type_UD);
7496
7497 G4_Declare* defDcl = NULL;
7498
7499 defDcl = builder.createTempVar(1, src->getType(), Eight_Word);
7500 G4_Declare* dcl = builder.createTempVar(2, Type_UD, Eight_Word);
7501 dcl->setAliasDeclare(defDcl, 0);
7502
7503 G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, 1);
7504 G4_INST* lowMovInst = builder.createMov(g4::SIMD1, dstRegion, lowImm, InstOpt_WriteEnable, false);
7505
7506 bb->insertBefore(i, lowMovInst);
7507
7508 auto newDst = builder.createDst(dcl->getRegVar(), 0, 1, 1, dcl->getElemType());
7509 G4_INST* highMovInst = builder.createMov(g4::SIMD1, newDst, highImm, InstOpt_WriteEnable, false);
7510 bb->insertBefore(i, highMovInst);
7511
7512 inst->transferDef(lowMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
7513 lowMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
7514 inst->transferDef(highMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
7515 highMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
7516
7517 unsigned short vs = 0, hs = 0, wd = 1; // gen7_5: always 0;1,0
7518 G4_SrcRegRegion* new_src = builder.createSrcRegRegion(defDcl,
7519 builder.createRegionDesc(vs, wd, hs));
7520 inst->setSrc(new_src, j);
7521 }
7522 else
7523 {
7524 if (inst->opcode() != G4_mov)
7525 {
7526 inst->setSrc(insertMovBefore(i, j, src->getType(), bb), j);
7527 }
7528 }
7529 }
7530 }
7531
7532 // Check if the source of def_inst is redefined before inst
checkSrcDefInst(G4_INST * inst,G4_INST * def_inst,uint32_t srcNum)7533 G4_INST* HWConformity::checkSrcDefInst(G4_INST* inst,
7534 G4_INST* def_inst,
7535 uint32_t srcNum)
7536 {
7537 G4_INST* valid_inst = def_inst;
7538
7539 if (def_inst != NULL)
7540 {
7541 MUST_BE_TRUE(def_inst->opcode() == G4_mov, "def inst must be a mov instruction");
7542
7543 G4_INST* def_inst1 = NULL;
7544 for (auto def_it1 = inst->def_begin(), end = inst->def_end(); def_it1 != end; def_it1++)
7545 {
7546 if ((*def_it1).second == srcNum + 1)
7547 {
7548 def_inst1 = (*def_it1).first;
7549 }
7550 }
7551
7552 if (def_inst1 != NULL)
7553 {
7554 G4_INST* def_inst2 = NULL;
7555 for (auto def_it2 = def_inst->def_begin(), end2 = def_inst->def_end(); def_it2 != end2; def_it2++)
7556 {
7557 if ((*def_it2).second == Opnd_src0)
7558 {
7559 def_inst2 = (*def_it2).first;
7560 }
7561 }
7562
7563 if (def_inst1 != def_inst2)
7564 {
7565 valid_inst = NULL;
7566 }
7567 }
7568 }
7569
7570 return valid_inst;
7571 }
7572
7573 /*
7574 Helper function for fixMixedHFInst
7575 It assumes dst is not null and is of type DstRegRegion.
7576 This check must be done before this method is called.
7577 */
helperGenerateTempDst(G4_BB * bb,INST_LIST_ITER instIter,G4_INST * inst,uint8_t hStride,G4_Type tempDstType,G4_SubReg_Align subAlign)7578 void HWConformity::helperGenerateTempDst(
7579 G4_BB* bb,
7580 INST_LIST_ITER instIter,
7581 G4_INST* inst,
7582 uint8_t hStride,
7583 G4_Type tempDstType,
7584 G4_SubReg_Align subAlign)
7585 {
7586 G4_DstRegRegion* dst = inst->getDst();
7587 G4_ExecSize execSize = inst->getExecSize();
7588 uint8_t dstSize = execSize * TypeSize(tempDstType);
7589 //create a new temp with horizontal stride of 1 (packed)
7590 //create a move to dst.
7591
7592 uint32_t numElt = execSize == 1 ? 1 : execSize * hStride;
7593 if (numElt > 1 && isLowPrecisionFloatTy(tempDstType) && hStride == 1 && subAlign < Eight_Word)
7594 subAlign = Eight_Word;
7595 subAlign = getDclAlignment(dstSize, inst, execSize == 1);
7596
7597 G4_Declare* dcl = builder.createTempVar(numElt, tempDstType, subAlign);
7598
7599 G4_DstRegRegion* dstRegion = builder.createDstRegRegion(dcl, hStride);
7600 inst->setDest(dstRegion);
7601
7602 const RegionDesc* region =
7603 execSize == 1 ?
7604 builder.getRegionScalar() :
7605 builder.createRegionDesc(execSize * hStride, execSize, hStride);
7606 G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(dcl, region);
7607
7608 //creating a mov from temp dst to final destination using original options of fixed instruction
7609 G4_INST* movInst = builder.createMov(
7610 execSize, dst, srcRegion, inst->getMaskOption(), false);
7611
7612 ++instIter;
7613 //inserting mov after fixed instruction
7614 bb->insertBefore(instIter, movInst);
7615
7616 /*
7617 Need to remove dst from uses list of mulh, and add them to movInst useList
7618 add movInst to uselist of mulh.
7619 Add mulh to def instruction list of movInst
7620 */
7621 inst->transferUse(movInst);
7622 inst->addDefUse(movInst, Opnd_src0);
7623 }
7624
7625 /*
7626 Not Implemented rules:
7627
7628 3: (Does this mean align1 doesn't support replication?)
7629 In Align16 mode, replicate is supported and is coissueable.
7630
7631 4: (handled in reduce execution size)
7632 No simd16 in mixed mode when destination is packed f16 for both Align1 and Align16.
7633
7634 mad(8) r3.xyzw:hf r4.xyzw:f r6.xyzw:hf r7.xyzw:hf
7635
7636 add(8) r20.0<1>:hf r3<8;8,1>:f r6.0<8;8,1>:hf {Q1}
7637
7638 5: (we are not producing this type of code)
7639 No accumulator read access for align16 mixed float
7640
7641 6: (we do not generate code like this)
7642 [DevCHV, DevSKL+]: When source is float from accumulator register and destination is half float with a stride of 1, the source must register aligned. i.e., source must have offset zero.
7643
7644 7: (doesn't seem like it is applicable to our code)
7645 In Align16, vertical stride can never be zero for f16
7646
7647 8.a: (handled by another check)
7648 Math operations for mixed mode,
7649 - In Align16, only packed format is supported
7650
7651 11. (handled in reduce execution size)
7652 [DevCHV, DevSKL, DevBXT]: No simd16 in mixed mode when destination is f32. Instruction Execution size must be no more than 8.
7653
7654 */
// Enforce mixed-precision float (HF/BF with F) restrictions for every
// instruction in the block: workaround moves for HF immediates, pure-BF
// promotion, mad-src2/mul-src1 HF restrictions, SIMD width limits, packed
// HF dst workarounds, and indirect-source restrictions on CHV/SKL.
void HWConformity::fixMixedHFInst(G4_BB* bb)
{
    for (auto instIter = bb->begin(); instIter != bb->end(); ++instIter)
    {
        G4_INST* inst = *instIter;

        // Instructions spanning more than two GRFs (e.g. sends) and
        // instructions without a dst are not candidates here.
        if (inst->mayExceedTwoGRF() || !inst->getDst())
        {
            continue;
        }

        // WA: an HF immediate is not allowed on src1; load it with a mov.
        if (VISA_WA_CHECK(builder.getPWaTable(), WaSrc1ImmHfNotAllowed))
        {
            G4_Operand* tSrc1 = inst->getSrc(1);
            if (tSrc1 && tSrc1->isImm() && tSrc1->getType() == Type_HF)
            {
                inst->setSrc(insertMovBefore(instIter, 1, Type_HF, bb), 1);
            }
        }

        if (builder.hasPartialMixMode() && inst->getNumSrc() > 1)
        {
            // Detect an all-BF instruction: dst and every source are BF.
            bool isPureBF = true;
            if (inst->getDst()->getType() != Type_BF)
            {
                isPureBF = false;
            }
            for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
            {
                if (inst->getSrc(i)->getType() != Type_BF)
                {
                    isPureBF = false;
                    break;
                }
            }
            if (isPureBF)
            {
                // pure BF arithmetic instruction is not supported, we make src0 F
                replaceSrc(instIter, 0, Type_F, bb);
            }

            // no HF on mad src2 or mul src1
            if (inst->isMixedMode())
            {
                // Returns true when the restricted source can be swapped with
                // its neighbor instead of inserting a conversion mov.
                auto canSwapSource = [](G4_INST* inst)
                {
                    int srcPos = inst->opcode() == G4_mad ? 2 : 1;
                    G4_Operand* src = inst->getSrc(srcPos);
                    G4_Operand* otherSrc = inst->getSrc(srcPos - 1);
                    if (src->isImm() || otherSrc->getType() != Type_F)
                    {
                        // swapping won't work
                        return false;
                    }
                    if (inst->opcode() == G4_mad)
                    {
                        // src2 has more restrictive regioning, so we can swap only when
                        // src1 is scalar or has contiguous region
                        if (otherSrc->isSrcRegRegion())
                        {
                            G4_SrcRegRegion* other = otherSrc->asSrcRegRegion();
                            if (other->getRegion()->isScalar() ||
                                other->getRegion()->isContiguous(inst->getExecSize()))
                            {
                                return true;
                            }
                        }
                        return false;
                    }
                    else
                    {
                        // swapping is always legal for mul
                        return true;
                    }
                };
                if (inst->opcode() == G4_mad)
                {
                    if (isLowPrecisionFloatTy(inst->getSrc(2)->getType()))
                    {
                        if (canSwapSource(inst))
                        {
                            inst->swapSrc(1, 2);
                        }
                        else
                        {
                            inst->setSrc(insertMovBefore(instIter, 2, Type_F, bb), 2);
                        }
                    }
                    // at this point src2 must be F. Dst must be aligned to
                    // same subreg as src2 if src2 is non-scalar
                    bool nonScalarSrc2 = inst->getSrc(2)->isSrcRegRegion() &&
                        !inst->getSrc(2)->asSrcRegRegion()->getRegion()->isScalar();
                    if (nonScalarSrc2)
                    {
                        if (!builder.isOpndAligned(inst->getDst(), numEltPerGRF<Type_UB>()))
                        {
                            replaceDst(instIter, Type_F, GRFALIGN);
                        }
                        if (!builder.isOpndAligned(inst->getSrc(2), numEltPerGRF<Type_UB>()))
                        {
                            inst->setSrc(insertMovBefore(instIter, 2, inst->getSrc(2)->getType(), bb, GRFALIGN), 2);
                        }
                    }
                }
                else if (inst->opcode() == G4_mul && isLowPrecisionFloatTy(inst->getSrc(1)->getType()))
                {
                    if (canSwapSource(inst))
                    {
                        inst->swapSrc(0, 1);
                    }
                    else
                    {
                        inst->setSrc(insertMovBefore(instIter, 1, Type_F, bb), 1);
                    }
                }
            }
        }

        // The execution size must be no more than 8 when half-floats are used in source or destination operand.
        // ToDO: move this to fixmathinst
        if (inst->getExecSize() > builder.getNativeExecSize())
        {
            if (inst->opcode() == G4_math &&
                inst->getDst()->getType() == Type_HF &&
                inst->getSrc(0)->getType() == Type_HF &&
                (!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
            {
                evenlySplitInst(instIter, bb);
            }
        }

        // Float-source-only opcodes cannot write a low-precision float dst
        // directly; route through an F temp.
        G4_DstRegRegion* dst = inst->getDst();
        if (INST_FLOAT_SRC_ONLY(inst->opcode()) && dst && !dst->isNullReg() && isLowPrecisionFloatTy(dst->getType()))
        {
            helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
        }

        if (!inst->isMixedMode())
            continue;

        if (inst->getDst() && !inst->getDst()->isNullReg())
            dst = inst->getDst();

        // WA: mixed-mode sel cannot have a packed HF destination.
        if ((VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) ||
            VISA_WA_CHECK(builder.getPWaTable(), WaFloatMixedModeSelNotAllowedWithPackedDestination)) &&
            inst->opcode() == G4_sel &&
            dst &&
            (VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) || dst->getHorzStride() == 1) &&
            dst->getType() == Type_HF)
        {
            helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
        }

        if (!inst->isMixedMode())
            continue;

        if (builder.getPlatform() >= GENX_CHV)
        {
            // no SIMD16 mix mode instruction
            if (inst->getExecSize() > builder.getNativeExecSize() && inst->isMixedMode())
            {
                evenlySplitInst(instIter, bb, false);
                //instruction was split, and new instruction inserted before
                //going back to previous instruction to double check it still conforms.
                --instIter;
                inst = *instIter;
            }
        }

        /*
            12: [DevCHV, DevSKL]: Indirect Addressing on source is not supported when source and destination data types are mixed float.
        */
        if (builder.getPlatform() == GENX_CHV || builder.getPlatform() == GENX_SKL)
        {
            for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
            {
                G4_Operand* src = inst->getSrc(i);
                if (src == nullptr || !src->isSrcRegRegion() || !src->asSrcRegRegion()->isIndirect())
                {
                    continue;
                }
                inst->setSrc(insertMovBefore(instIter, i, src->getType(), bb), i);
            }
        }

        // Packed HF dsts of mixed-mode instructions must be at least
        // oword-aligned; tighten the declare's subreg alignment.
        if (inst->getDst()->getBase()->isRegVar() &&
            inst->getDst()->getType() == Type_HF &&
            inst->getDst()->getHorzStride() == 1)
        {
            inst->getDst()->getBase()->asRegVar()->getDeclare()->setSubRegAlign(Eight_Word);
        }
    }
}
7848
7849 // Fix for packed half types on BDW.
7850 // Conversions from F to packed HF are not supported on this platform,
7851 // only unpacked HF is supported on destination.
7852 // When we encounter an instruction with HF type on destination with <1> stride
7853 // and float on source, add an additional mov that handles unpacking.
fixPackedHFConversions(INST_LIST_ITER it,G4_BB * bb)7854 void HWConformity::fixPackedHFConversions(INST_LIST_ITER it, G4_BB* bb)
7855 {
7856 G4_INST* inst = *it;
7857 G4_DstRegRegion* dst = inst->getDst();
7858 if (dst && dst->getType() == Type_HF && dst->getHorzStride() == 1 &&
7859 TypeSize(inst->getExecType()) > 2)
7860 {
7861 helperGenerateTempDst(bb, it, inst, 2, Type_HF);
7862 }
7863 }
7864
// Legalize src2 of a 3-src instruction on platforms where src2 has no
// regioning support: copy src2 into a GRF-aligned temp (promoting its type
// where that saves a later move), and if the dst element size then differs
// from src2's, re-route dst through a GRF-aligned temp as well.
// swapSrc0and2 indicates the operand order was swapped earlier, so the
// "src2" to fix actually sits at position 0.
void HWConformity::fixSrc2(INST_LIST_ITER it, G4_BB* bb, bool swapSrc0and2)
{
    G4_INST* inst = *it;
    int srcPos = swapSrc0and2 ? 0 : 2; // unfortunate side effect of vISA mad and Gen mad having difference src order
    assert(inst->getNumSrc() == 3 && "expect 3-src inst");
    if (builder.noSrc2Regioning())
    {
        auto src = inst->getSrc(srcPos);
        // we have to make sure src2 and dst are aligned
        // Promote src2's type to f if mix mode is supported.
        // e.g.,
        // mad (4) r10.0<1>:f src0 src1 r12.0<1>:hf --> f
        // mad (4) r10.0<2>:hf src0 src1 r12.0<1>:hf --> f
        // mad (4) r10.0<1>:hf src0 src1 r12.0<2>:hf --> hf
        // mad (4) r10.0<2>:hf src0 src1 r12.1<2>:hf --> f
        // ditto for 3-src inst with int types
        G4_Type srcTy = src->getType();
        unsigned short dstEltSz = inst->getDst()->getExecTypeSize();
        if (dstEltSz >= 4)
        {
            // widen sub-dword int types to dword so the temp copy is packed
            // at the dst element size
            if (IS_SIGNED_INT(srcTy))
            {
                srcTy = Type_D;
            }
            else if (IS_UNSIGNED_INT(srcTy))
            {
                srcTy = Type_UD;
            }
            else if (builder.hasMixMode() && builder.getMixModeType() == srcTy)
            {
                // we can change operand type to F to save one move
                srcTy = Type_F;
            }
        }
        // copy src2 into a GRF-aligned temp of the (possibly promoted) type
        inst->setSrc(insertMovBefore(it, srcPos, srcTy, bb, GRFALIGN), srcPos);

        // Check if dst stride aligns with src2.
        if (dstEltSz != TypeSize(srcTy))
        {
            replaceDst(it, inst->getDst()->getType(), GRFALIGN);
        }
    }
}
7908
// Legalize indirect VxH (per-lane address, W/H region) sources on src0:
//  - float-pipe operations (HF/F type, or int->HF/F mov) cannot read a VxH
//    region directly: raw moves are simply retyped to int; other ops get a
//    copy-through-int mov inserted before them.
//  - 64b-typed VxH sources are gathered into a packed temp via dword copies
//    and the instruction then reads the temp with a unit-stride region.
void HWConformity::fixVxHFloat64b(INST_LIST_ITER it, G4_BB* bb)
{
    // at this point VxH region should only be on src0
    G4_INST* inst = *it;
    G4_SrcRegRegion* src0 = inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion() ?
        inst->getSrc(0)->asSrcRegRegion() : nullptr;

    if (src0 && src0->getRegAccess() == IndirGRF && src0->getRegion()->isRegionWH())
    {
        auto type = src0->getType();
        // additionally check for int->float type conversion
        // FIXME: replace with SWSB's pipe check functions
        bool isFloatPipe = type == Type_HF || type == Type_F;
        if (inst->opcode() == G4_mov)
        {
            isFloatPipe |= TypeSize(type) < 8 && (inst->getDst()->getType() == Type_HF || inst->getDst()->getType() == Type_F);
        }
        if (isFloatPipe)
        {
            auto intType = TypeSize(type) == 4 ? Type_UD : Type_UW;
            if (inst->isRawMov())
            {
                // directly change the dst/src type to int
                inst->getDst()->setType(intType);
                src0->setType(intType);
            }
            else
            {
                // generate a copy move using int type
                // FIXME: code is a bit hacky, may want to change insertMovBefore
                // so that we could specify the move type
                // (temporarily retype src0 to int so the inserted mov is an
                // int copy, then restore the original type/modifier on the
                // new source that replaces it)
                auto origType = src0->getType();
                auto origMod = src0->getModifier();
                src0->setType(intType);
                src0->setModifier(Mod_src_undef);
                auto newSrc = insertMovBefore(it, 0, intType, bb);
                newSrc->asSrcRegRegion()->setType(origType);
                newSrc->asSrcRegRegion()->setModifier(origMod);
                inst->setSrc(newSrc, 0);
            }
        }
        else if (TypeSize(type) == 8)
        {
            // gather the 64b elements (2 dwords each) into a packed temp
            int numDwords = inst->getExecSize() * 2;
            G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src0->getType(), Any);
            const RegionDesc* newRegion = builder.getRegionStride1();
            copyDwordsIndirect(tmpSrc, src0, numDwords, bb, it);
            G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(src0->getModifier(),
                Direct, tmpSrc->getRegVar(), 0, 0, newRegion, tmpSrc->getElemType());
            inst->setSrc(tmpSrcOpnd, 0);
        }
    }
}
7962
fixIntToHFMove(G4_BB * bb)7963 bool HWConformity::fixIntToHFMove(G4_BB* bb)
7964 {
7965 // int to HF move requires dst to have stride 2, which would result in
7966 // an illegal SIMD32 inst. So we split in this case
7967 // we put it in a separate pass so that the split instructions may be legalized later
7968 bool changed = false;
7969 for (auto I = bb->begin(), E = bb->end(); I != E; ++I)
7970 {
7971 auto inst = *I;
7972 if (inst->opcode() == G4_mov && inst->getDst()->getType() == Type_HF &&
7973 IS_INT(inst->getSrc(0)->getType()))
7974 {
7975 if (inst->getExecSize() * 2 * 2 > getGRFSize() * 2)
7976 {
7977 evenlySplitInst(I, bb);
7978 changed = true;
7979 }
7980 }
7981 }
7982 return changed;
7983 }
7984
// Workaround for platforms without predCtrl group-size support: when an
// any-whole/all-whole predicate's flag has more elements than the inst's
// exec size, reduce the flag into a fresh flag with a cmp and predicate on
// that instead. The original instruction keeps its predicate state but is
// repointed at the temp flag.
void HWConformity::fixPredCtrl(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* inst = *it;
    G4_Predicate* pred = inst->getPredicate();
    if (pred && (pred->getControl() == PRED_ANY_WHOLE || pred->getControl() == PRED_ALL_WHOLE))
    {
        // we need WA if pred's size is greater than inst's exec size
        // and the platform does not support predctrl group size (indicated by the fact we
        // have PRED_ANY_WHOLE and PRED_ALL_WHOLE)
        // The case where pred size is less than inst's exec size is already undefined
        // even with predCtrl group size..
        G4_Declare* flagDcl = pred->getTopDcl();
        if (flagDcl->getNumberFlagElements() > inst->getExecSize())
        {
            // convert
            // (f0.any32h) sel (1) ...
            // into
            // cmp (1) [ne] f1 f0 0
            // (f1) sel (1) ...
            // and
            // (f0.all32h) sel (1) ...
            // into
            // cmp (1) [e] f1 f0 0xFFFFFFFF
            //
            // if f0 happens to be < 16 elements we have to clear upper bits as well in case it has garbage values
            assert(!inst->getCondMod() && "currently don't handle an instruction with conditional modifier");
            assert((inst->isWriteEnableInst() || bb->isAllLaneActive()) && "don't handle instruction in SIMD CF for now");
            G4_Declare* tmpFlag = builder.createTempFlag(1);
            G4_Type flagType = flagDcl->getNumberFlagElements() == 32 ? Type_UD : Type_UW;
            // mask of all valid flag bits (flag may be narrower than 32)
            uint32_t allOneMask = (uint32_t)((1ULL << flagDcl->getNumberFlagElements()) - 1);
            G4_Declare* cmpSrc0Flag = flagDcl;
            if (flagDcl->getNumberFlagElements() < 16)
            {
                // clear the upper bit of the flag
                auto andInst = builder.createBinOp(G4_and, g4::SIMD1, builder.createDstRegRegion(tmpFlag, 1),
                    builder.createSrcRegRegion(flagDcl, builder.getRegionScalar()),
                    builder.createImm(allOneMask, Type_UW), InstOpt_WriteEnable, false);
                bb->insertBefore(it, andInst);
                cmpSrc0Flag = tmpFlag;
            }
            // any: flag != 0; all: flag == allOneMask
            G4_CondMod* condMod = builder.createCondMod(pred->getControl() == PRED_ANY_WHOLE ? Mod_ne : Mod_e,
                tmpFlag->getRegVar(), 0);

            G4_Imm* immVal = builder.createImm(pred->getControl() == PRED_ANY_WHOLE ? 0 : allOneMask, flagType);
            // cmp needs to be as wide as the original inst but is uniform and NoMask otherwise
            auto cmpInst = builder.createInternalInst(
                nullptr, G4_cmp, condMod, g4::NOSAT, inst->getExecSize(),
                builder.createNullDst(flagType),
                builder.createSrc(cmpSrc0Flag->getRegVar(), 0, 0, builder.getRegionScalar(), flagType),
                immVal,
                InstOpt_WriteEnable);
            bb->insertBefore(it, cmpInst);
            inst->setPredicate(builder.createPredicate(pred->getState(), tmpFlag->getRegVar(), 0));
        }
    }
}
8041
8042 // emulate mov F BF
8043 // with
8044 // shl UD UW 16
fixBFMove(INST_LIST_ITER i,G4_BB * bb)8045 bool HWConformity::fixBFMove(INST_LIST_ITER i, G4_BB* bb)
8046 {
8047 G4_INST* inst = *i;
8048 if (inst->opcode() != G4_mov)
8049 {
8050 return false;
8051 }
8052 G4_Operand* src0 = inst->getSrc(0);
8053
8054 if (inst->getDst()->getType() == Type_BF)
8055 {
8056 // allow BF->BF moves as they may be introduced during HW conformity
8057 // we will change their type to HF later
8058 assert((src0->getType() == Type_F || src0->getType() == Type_BF) &&
8059 "Only F->BF conversion is supported");
8060 assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8061 "F->BF move does not support pred/cond mod/sat");
8062 if (src0->isSrcRegRegion())
8063 {
8064 assert(src0->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8065 "F->BF move does not support source modifier");
8066 }
8067 if (src0->getType() == Type_BF)
8068 {
8069 // change type of copy move to uw
8070 inst->getDst()->setType(Type_UW);
8071 src0->asSrcRegRegion()->setType(Type_UW);
8072 }
8073 return false;
8074 }
8075
8076 if (src0->getType() == Type_BF)
8077 {
8078 assert(inst->getDst()->getType() == Type_F && "Only BF->F conversion is supported");
8079 assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
8080 "BF->F move does not support pred/cond mod/sat");
8081 // don't support BF imm for now
8082 assert(src0->isSrcRegRegion() &&
8083 src0->asSrcRegRegion()->getModifier() == Mod_src_undef &&
8084 "F->BF move does not support source modifier");
8085
8086 auto src0RR = src0->asSrcRegRegion();
8087
8088 src0RR->setType(Type_UW);
8089 G4_SrcRegRegion* newSrc0 = src0RR;
8090
8091 inst->getDst()->setType(Type_UD);
8092 auto newDst = inst->getDst();
8093
8094 auto shlInst = builder.createBinOp(G4_shl,
8095 inst->getExecSize(), newDst, newSrc0, builder.createImm(16, Type_UW), inst->getOption(), false);
8096 bb->insertBefore(i, shlInst);
8097 bb->erase(i);
8098
8099 return true;
8100 }
8101
8102 return false;
8103 }
8104
isFloatOr64b(G4_INST * inst)8105 bool HWConformity::isFloatOr64b(G4_INST* inst)
8106 {
8107 auto dst = inst->getDst();
8108 auto dstTy = dst->getType();
8109
8110 bool goFloatPipe = IS_TYPE_FLOAT_ALL(dstTy) || TypeSize(dstTy) >= 8;
8111
8112 if (!goFloatPipe)
8113 {
8114 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8115 {
8116 auto src = inst->getSrc(i);
8117 if (src)
8118 {
8119 bool nonScalarFloat = IS_TYPE_FLOAT_ALL(src->getType()) &&
8120 src->isSrcRegRegion() && !src->asSrcRegRegion()->isScalar();
8121 // Q type may be mixed with other int (e.g., D = Q + D), so always needs checking as we may need
8122 // to fix the other operands.
8123 // float type only needs checking if it's non-scalar
8124 // ToDo: consider skipping all mixed mode as they should already confirm to region rules
8125 if (IS_QTYPE(src->getType()) || nonScalarFloat)
8126 {
8127 goFloatPipe = true;
8128 break;
8129 }
8130 }
8131 }
8132 }
8133 return goFloatPipe;
8134 }
8135
getSrcStride(G4_SrcRegRegion * src)8136 uint16_t HWConformity::getSrcStride(G4_SrcRegRegion* src)
8137 {
8138 uint16_t srcStride = 0;
8139 src->getRegion()->isSingleStride(src->getInst()->getExecSize(), srcStride);
8140 srcStride *= src->getTypeSize();
8141 return srcStride;
8142 };
8143
change64bStride2CopyToUD(INST_LIST_ITER it,G4_BB * bb)8144 void HWConformity::change64bStride2CopyToUD(INST_LIST_ITER it, G4_BB* bb)
8145 {
8146 G4_INST* inst = *it;
8147 G4_Operand* src = inst->getSrc(0);
8148 MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
8149 G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
8150 G4_Type execType = inst->getDst()->getType();
8151 uint16_t stride = inst->getDst()->getHorzStride();
8152 short dstRegOff = inst->getDst()->getRegOff();
8153 short dstSubRegOff = inst->getDst()->getSubRegOff();
8154
8155 assert((execType == Type_Q || execType == Type_DF) && "Only 64b data type support");
8156 execType = Type_UD;
8157 dstSubRegOff *= 2;
8158
8159 G4_DstRegRegion* newDst = builder.createDst(
8160 inst->getDst()->getBase(),
8161 dstRegOff,
8162 dstSubRegOff + 1,
8163 stride * 2,
8164 execType);
8165 G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(origSrc->getModifier(), Direct, origSrc->getBase(),
8166 origSrc->getRegOff(), origSrc->getSubRegOff() * 2 + 1, builder.createRegionDesc(2, 1, 0), Type_UD);
8167 inst->setSrc(newSrc, 0);
8168 inst->setDest(newDst);
8169
8170 G4_DstRegRegion* newDst1 = builder.createDst(
8171 inst->getDst()->getBase(),
8172 dstRegOff,
8173 dstSubRegOff,
8174 stride * 2,
8175 execType);
8176 G4_SrcRegRegion* newSrc1 = builder.createSrcRegRegion(origSrc->getModifier(), Direct, origSrc->getBase(),
8177 origSrc->getRegOff(), origSrc->getSubRegOff() * 2, builder.createRegionDesc(2, 1, 0), Type_UD);
8178
8179 G4_INST* movInst = builder.createMov(inst->getExecSize(), newDst1, newSrc1, inst->getOption(), false);
8180
8181 INST_LIST_ITER iter = it;
8182 iter++;
8183 bb->insertBefore(it, movInst);
8184 }
8185
// on XeHP_SDV we have to make sure each source element is aligned to each dst element
8187 // for all float/64b inst (packed HF is ok in mixed mode inst)
8188 // For all violating instructions, we align each operand to the execution type
8189 // for float copy moves we could directly convert their type to int
fixUnalignedRegions(INST_LIST_ITER it,G4_BB * bb)8190 void HWConformity::fixUnalignedRegions(INST_LIST_ITER it, G4_BB* bb)
8191 {
8192 G4_INST* inst = *it;
8193 if (!inst->getDst() || inst->isSend() || inst->isDpas() ||
8194 hasDedicateAlignRegionConformity(it) ||
8195 inst->getExecSize() == g4::SIMD1)
8196 {
8197 // only check non-scalar ALU instructions
8198 return;
8199 }
8200
8201 if (!isFloatOr64b(inst))
8202 {
8203 return;
8204 }
8205 auto dst = inst->getDst();
8206 auto dstTy = dst->getType();
8207 G4_Type execTy = inst->getExecType();
8208 if (TypeSize(dstTy) > TypeSize(execTy))
8209 {
8210 // getExecType() does not take dst ty into account, while we have to consider the widest type
8211 // in all operands here
8212 execTy = dstTy;
8213 }
8214 auto execTyWidth = TypeSize(execTy);
8215
8216 // input must be a 64b copy move with packed dst and singly-strided src
8217 // this works for both direct and indirect dst and src
8218 auto change64bCopyToUD = [this](G4_INST* movInst, uint16_t srcStride)
8219 {
8220 auto oldSrc = movInst->getSrc(0)->asSrcRegRegion();
8221 G4_SrcRegRegion* movSrc = nullptr;
8222 if (oldSrc->getRegAccess() == Direct)
8223 {
8224 // change region, type, and subreg offset
8225 movSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, oldSrc->getBase(),
8226 oldSrc->getRegOff(), oldSrc->getSubRegOff() * 2, builder.createRegionDesc(srcStride * 2, 2, 1), Type_UD);
8227 }
8228 else
8229 {
8230 // change region and type
8231 movSrc = builder.createIndirectSrc(oldSrc->getModifier(), oldSrc->getBase(), oldSrc->getRegOff(),
8232 oldSrc->getSubRegOff(), builder.createRegionDesc(srcStride * 2, 2, 1), Type_UD, oldSrc->getAddrImm());
8233 }
8234 movInst->setSrc(movSrc, 0);
8235
8236 auto oldDst = movInst->getDst();
8237 G4_DstRegRegion* movDst = nullptr;
8238 if (oldDst->getRegAccess() == Direct)
8239 {
8240 movDst = builder.createDst(oldDst->getBase(), oldDst->getRegOff(), oldDst->getSubRegOff() * 2, oldDst->getHorzStride(), Type_UD, oldDst->getAccRegSel());
8241 }
8242 else
8243 {
8244 movDst = builder.createIndirectDst(oldDst->getBase(), oldDst->getSubRegOff(), oldDst->getHorzStride(), Type_UD, oldDst->getAddrImm());
8245 }
8246 movInst->setDest(movDst);
8247 movInst->setExecSize(G4_ExecSize(movInst->getExecSize() * 2u));
8248 movInst->setOptionOn(InstOpt_WriteEnable);
8249 // caller guarantees movInst is not predicated, so we can reset its mask offset to 0
8250 // this is to avoid a bug where changing
8251 // mov (8|M24) r2.0<1>:q
8252 // -->
8253 // mov (16|M24) r2.0<1>:ud
8254 // would result in illegal mask offset for SIMD16
8255 movInst->setMaskOption(InstOpt_M0);
8256 };
8257
8258 if (inst->isRawMov())
8259 {
8260 // we can do better for float/64b copy moves by directly changing their type
8261 bool done = true;
8262 if (inst->getSrc(0)->isSrcRegRegion() && !inst->getSrc(0)->asSrcRegRegion()->isScalar())
8263 {
8264 auto src0RR = inst->getSrc(0)->asSrcRegRegion();
8265 int dstStride = TypeSize(dstTy) * inst->getDst()->getHorzStride();
8266 int srcStride = getSrcStride(src0RR);
8267 if (dstStride != srcStride || !builder.isOpndAligned(inst->getSrc(0), getGRFSize()) ||
8268 !builder.isOpndAligned(inst->getDst(), getGRFSize()))
8269 {
8270 bool isNoMaskInst = !inst->getPredicate() && (inst->isWriteEnableInst() || bb->isAllLaneActive());
8271 if (execTyWidth < 8)
8272 {
8273 auto intType = TypeSize(dstTy) == 4 ? Type_UD : Type_UW;
8274 inst->getDst()->setType(intType);
8275 src0RR->setType(intType);
8276 }
8277 else if (isNoMaskInst && inst->getDst()->getHorzStride() == 1 && srcStride != 0)
8278 {
8279 // for packed 64b copy moves that are not under divergent CF, we can
8280 // change its type to UD
8281 change64bCopyToUD(inst, srcStride / inst->getSrc(0)->getTypeSize());
8282 }
8283 else if (isNoMaskInst && inst->getDst()->getHorzStride() == 2 && execTyWidth == 8 &&
8284 src0RR->getRegion()->isContiguous(inst->getExecSize()))
8285 {
8286 change64bStride2CopyToUD(it, bb);
8287 }
8288 else if (execTyWidth == 8 && IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0RR->getType()) && srcStride != 0 && !src0RR->isIndirect())
8289 {
8290 // we can split 64b moves with single source stride into 2UD moves
8291 // ToDo: check if this subsumes the previous else if
8292 emulate64bMov(it, bb);
8293 }
8294 else
8295 {
8296 // a move we don't know how to handle without inserting more moves
8297 done = false;
8298 }
8299 }
8300 }
8301 if (done)
8302 {
8303 // the move is ok at this point
8304 return;
8305 }
8306 }
8307
8308 // some operands may have fixed offset (e.g., input), and we can directly check if all operands have the same sub-reg
8309 // for simplicity we require all operands to have same type and are packed.
8310 {
8311 bool goodOperand = true;
8312 if (inst->getDst()->getHorzStride() != 1)
8313 {
8314 goodOperand = false;
8315 }
8316 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8317 {
8318 if (inst->getSrc(i)->isSrcRegRegion())
8319 {
8320 auto srcRR = inst->getSrc(i)->asSrcRegRegion();
8321 if (srcRR->getType() != inst->getDst()->getType() ||
8322 (!srcRR->isScalar() && !srcRR->getRegion()->isContiguous(inst->getExecSize())))
8323 {
8324 goodOperand = false;
8325 break;
8326 }
8327 }
8328 }
8329 uint32_t commonOffset = 0;
8330 if (goodOperand && hasSameSubregOffset(inst, commonOffset) && commonOffset != 0)
8331 {
8332 //for some strange reason HW requires null operands to have the same subreg offset as other operands as well
8333 if (inst->getDst()->isNullReg())
8334 {
8335 inst->setDest(builder.createDst(builder.phyregpool.getNullReg(), 0, commonOffset / dst->getTypeSize(), 1, dst->getType()));
8336 }
8337 return;
8338 }
8339 }
8340
8341 if (inst->getExecSize() == g4::SIMD2 && inst->getNumSrc() != 3)
8342 {
8343 if (inst->getDst()->getAccRegSel() != ACC_UNDEFINED)
8344 {
8345 // this instruction is internally generated, no need to check
8346 return;
8347 }
8348
8349 // split currently can't handle packed imm
8350 // Also don't split src byte type since scalar byte to float conversion is not allowed
8351 auto canSplit = [](G4_INST* inst)
8352 {
8353 if (inst->getPredicate() || inst->getCondMod())
8354 {
8355 return false;
8356 }
8357 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8358 {
8359 auto ty = inst->getSrc(i)->getType();
8360 if (IS_VINTTYPE(ty) || ty == Type_VF || IS_BTYPE(ty))
8361 {
8362 return false;
8363 }
8364 }
8365 return true;
8366 };
8367 if (canSplit(inst))
8368 {
8369 auto prevIt = it == bb->begin() ? it : std::prev(it);
8370 if (evenlySplitInst(it, bb))
8371 {
8372 // split introduces new moves which may need fixing
8373 // after splitting it points to the second instruction
8374 INST_LIST_ITER splitIt = std::prev(it);
8375 INST_LIST_ITER insertIt = prevIt == bb->begin() ? prevIt : std::next(prevIt);
8376 while (insertIt != splitIt)
8377 {
8378 fixUnalignedRegions(insertIt, bb);
8379 insertIt++;
8380 }
8381 }
8382 return;
8383 }
8384 }
8385
8386 // fix Dst if necessary
8387 // some special mix mode dst are allowed provided the instruction has F type:
8388 // r1.0<2>:bf
8389 // r1.1<2>:bf
8390 // r1.0<1>:bf
8391 // r1.8<1>:bf
8392 bool isSpecialMixModeDst = false;
8393 bool canDoPackedFtoHFMove = builder.hasFtoPackedHFMove() && inst->opcode() == G4_mov && inst->getExecSize() >= builder.getNativeExecSize() &&
8394 dstTy == Type_HF && !dst->isIndirect();
8395 if ((builder.getMixModeType() == dstTy || canDoPackedFtoHFMove) && IS_FTYPE(execTy))
8396 {
8397 uint16_t offset = 0;
8398 bool isAligned = builder.isOpndAligned(dst, offset, getGRFSize() / 2);
8399 if (dst->getHorzStride() == 1)
8400 {
8401 isSpecialMixModeDst = isAligned;
8402 }
8403 else if (dst->getHorzStride() == 2)
8404 {
8405 isSpecialMixModeDst = isAligned || (offset % 32) == 2;
8406 }
8407 }
8408
8409 if (canDoPackedFtoHFMove && isSpecialMixModeDst)
8410 {
8411 if (inst->getExecSize() > builder.getNativeExecSize())
8412 {
8413 evenlySplitInst(it, bb);
8414 }
8415 return;
8416 }
8417
8418 auto dstStride = TypeSize(dstTy) * dst->getHorzStride();
8419 uint16_t dstAlign = inst->getSrc(0)->getType() == Type_VF ? 16 : getGRFSize();
8420 if (dst->getRegAccess() == Direct && !isSpecialMixModeDst &&
8421 (!builder.isOpndAligned(dst, dstAlign) || dstStride != execTyWidth))
8422 {
8423 inst->setDest(insertMovAfter(it, dst, dst->getType(), bb, GRFALIGN));
8424 if (IS_TYPE_FLOAT_ALL(dst->getType()) || dst->getTypeSize() == 8)
8425 {
8426 // the move may need more fixing
8427 fixUnalignedRegions(std::next(it), bb);
8428 }
8429 }
8430 else if (dst->getRegAccess() == IndirGRF && dst->getType() == Type_F)
8431 {
8432 // Since we can't know if an indirect dst is aligned or not,
8433 // The proper fix is to insert a move then change its type to int.
8434 // FIXME: not sure how to handle fp64 yet
8435 inst->setDest(insertMovAfter(it, dst, dst->getType(), bb, GRFALIGN));
8436 // the move may need more fixing
8437 fixUnalignedRegions(std::next(it), bb);
8438 }
8439
8440 auto getUnsignedType = [](int numByte)
8441 {
8442 switch (numByte)
8443 {
8444 case 1:
8445 return Type_UB;
8446 case 2:
8447 return Type_UW;
8448 case 4:
8449 return Type_UD;
8450 case 8:
8451 return Type_UQ;
8452 default:
8453 assert(false && "illegal type width");
8454 return Type_UD;
8455 }
8456 };
8457
8458 // generate a move where each element is aligned to execTyWidth
8459 // e.g.,
8460 // mov (8) V1<1>:q V2<1;1,0>:ud
8461 // becomes
8462 // mov (8) tmp<2>:ud V2<1;1,0>:ud
8463 // mov (8) V1<1>:q tmp<2;1,0>:ud
8464 // or
8465 // add (8) V1<1>:f V2<2;1,0>:f V3<1;1,0>:f
8466 // becomes
8467 // mov (8) tmp<1>:ud V2<2;1,0>:ud
8468 // add (8) V1<1>:f tmp<1;1,0>:f V3<1;1,0>:f
8469 // note that for float types we have to do the move in int since the move may be illegal otherwise
8470 auto doAlignMove = [&](G4_INST* inst, int srcPos, int stride)
8471 {
8472 // caller must ensure src is a srcregregion
8473 bool movNeedsFix = false;
8474 auto src = inst->getSrc(srcPos)->asSrcRegRegion();
8475 auto srcTy = src->getType();
8476 auto tmpTy = getUnsignedType((int)TypeSize(srcTy));
8477 auto movSrcTy = tmpTy;
8478 auto newSrcTy = srcTy;
8479 if (stride == 8 || (tmpTy == Type_UB &&
8480 builder.getNativeExecSize() > g4::SIMD8 &&
8481 (stride == 2 || stride == 4)))
8482 {
8483 // use UW as the tmp, and divide the stride by 2
8484 // there are two reasons for this transform,
8485 // 1) stride 8 is not supported
8486 // 2) avoid read-modify-write on bytes
8487 // mov (4) V1<4>:uw V2:ub
8488 // then use <4;1,0>:uw in the original inst
8489 tmpTy = (srcTy == Type_UB) ? Type_UW : Type_W;
8490 movSrcTy = srcTy;
8491 stride = stride / 2;
8492 newSrcTy = tmpTy;
8493 }
8494 auto tmp = builder.createTempVar(inst->getExecSize() * stride, tmpTy, GRFALIGN);
8495 auto movSrc = builder.createSrcRegRegion(*src);
8496 movSrc->setModifier(Mod_src_undef);
8497 movSrc->setType(movSrcTy);
8498 auto movInst = builder.createMov(inst->getExecSize(),
8499 builder.createDstRegRegion(tmp, stride), movSrc, inst->getOption(), false);
8500 if (movSrc->getTypeSize() == 8)
8501 {
8502 assert(stride == 1 && "expect dst stride to be 1 here");
8503 // the move instruction is itself illegal due to the source region being non-contiguous/not GRF-aligned
8504 // if the region is singly-strided, we can change it into a UD move, e.g.,
8505 // mov (8) V1<1>:q V2<2;1,0>:q
8506 // becomes
8507 // (W) mov (16) V1<1>:ud V2<4;2,1>:ud
8508 uint16_t srcStride = 0;
8509 if (movSrc->getRegion()->isSingleStride(inst->getExecSize(), srcStride))
8510 {
8511 change64bCopyToUD(movInst, srcStride);
8512 }
8513 else
8514 {
8515 movNeedsFix = true;
8516 }
8517 }
8518 bb->insertBefore(it, movInst);
8519 if (movNeedsFix)
8520 {
8521 // try splitting the move as last resort
8522 // it may be successful if we are not in SIMD CF
8523 evenlySplitInst(std::prev(it), bb);
8524 }
8525 auto newSrc = builder.createSrcRegRegion(src->getModifier(), Direct, tmp->getRegVar(), 0, 0,
8526 builder.createRegionDesc(stride, 1, 0), newSrcTy);
8527 inst->setSrc(newSrc, srcPos);
8528 };
8529
8530 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
8531 {
8532 G4_SrcRegRegion* src = inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() ?
8533 inst->getSrc(i)->asSrcRegRegion() : nullptr;
8534 if (src)
8535 {
8536 if (IS_BTYPE(src->getType()) && (src->getRegion()->isRegionWH() || src->getRegion()->isScalar()))
8537 {
8538 // no scalar byte when dst is float
8539 // byte src with DF dst is handled by fixMov
8540 inst->setSrc(insertMovBefore(it, 0, inst->getDst()->getTypeSize() == 4 ? Type_D : Type_W, bb), 0);
8541 }
8542 else if (!src->getRegion()->isRegionWH() && !src->getRegion()->isScalar())
8543 {
8544 // indirect VxH operands are handled elsewhere
8545 auto srcStride = getSrcStride(src);
8546 bool isMixModeSrc = isLowPrecisionFloatTy(src->getType()) && IS_FTYPE(execTy);
8547 bool isMixModePackedSrc = isMixModeSrc && srcStride == 2;
8548 uint16_t alignment = isMixModePackedSrc ? (getGRFSize() / 2) : getGRFSize();
8549 // for mix mode the source must be packed, otherwise srcStride shoudl be == sizeof(exec type)
8550 if (!builder.isOpndAligned(src, alignment) || (isMixModeSrc ? !isMixModePackedSrc : srcStride != execTyWidth))
8551 {
8552 int stride = (int)(isMixModeSrc ? 1 : execTyWidth / src->getTypeSize());
8553 doAlignMove(inst, i, stride);
8554 }
8555 }
8556 }
8557 }
8558 }
8559
8560 // emulate mov HF BF8
8561 // with
8562 // shl UW UB 8
// Legalize G4_fcvt conversions:
//  - HF->BF8 (dst :ub): enforce a <1;1,0> src region, a packed, 64B-aligned,
//    NoMask dst, and a 64B-aligned src.
//  - BF8->HF (src :ub): enforce packed/aligned regions, then emulate the
//    conversion as shl UW = UB << 8 (the BF8 bits become the high byte of
//    the HF value); the original fcvt is replaced and erased.
//  - TF32->F (src :ud): a tf32 payload is a valid F value, so lower to a mov.
//  - F->TF32 (dst :ud): dst and src must have the same subreg offset and
//    stride (except scalar broadcast); force GRF alignment/stride 1 if not.
// Returns true iff the instruction was rewritten.
bool HWConformity::fixFcvt(INST_LIST_ITER i, G4_BB* bb)
{
    G4_INST* inst = *i;
    if (inst->opcode() != G4_fcvt)
    {
        return false;
    }

    if (inst->getDst()->getType() == Type_UB)
    {
        assert((inst->getSrc(0)->getType() == Type_HF) &&
            "Only HF->BF8 conversion is supported");
        assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
            "HF->BF8 move does not support pred/cond mod/sat");
        assert(inst->getSrc(0)->isSrcRegRegion() &&
            "HF->BF8 currently supports non-imm source only");
        assert(inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == Direct &&
            inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef &&
            "HF->BF8 move does not support source modifier");

        // fix regioning <0;1,0> to <1;1,0> for execution sizes higher than 1.
        // (the inserted copy broadcasts the scalar into a packed temp, which
        // the fcvt then reads contiguously)
        if (inst->getSrc(0)->asSrcRegRegion()->getRegion()->isScalar() &&
            inst->getExecSize() != g4::SIMD1)
        {
            inst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
            inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
            INST_LIST_ITER newMovIter = i;
            newMovIter--;
            G4_INST* newMovInst = *newMovIter;
            // the copy itself still reads the original scalar
            newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionScalar());
        }
        assert(inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) &&
            "HF->BF8 only support <1;1,0> regioning");
        if (inst->getDst()->getHorzStride() != 1)
        {
            // convert through a packed temp; the follow-up mov that copies
            // the temp back to the strided dst is done as a raw UB copy
            replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
            INST_LIST_ITER newMovIter = i;
            newMovIter++;
            G4_INST* newMovInst = *newMovIter;
            newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
            newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
            if (inst->getExecSize() != g4::SIMD1)
            {
                newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
            }
            inst->getDst()->setHorzStride(1);
            inst->setOptionOn(InstOpt_WriteEnable);
        }
        if (!builder.isOpndAligned(inst->getDst(), 64) ||
            !inst->isWriteEnableInst())
        {
            // force a 64B-aligned NoMask dst; copy-out handled as raw UB mov
            replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
            INST_LIST_ITER newMovIter = i;
            newMovIter++;
            G4_INST* newMovInst = *newMovIter;
            newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
            newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
            inst->setOptionOn(InstOpt_WriteEnable);
        }
        if (!builder.isOpndAligned(inst->getSrc(0), 64))
        {
            inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
        }
        return true;
    }

    if (inst->getSrc(0)->getType() == Type_UB)
    {
        assert((inst->getDst()->getType() == Type_HF) &&
            "Only BF8->HF conversion is supported");
        assert(!inst->getPredicate() && !inst->getCondMod() && !inst->getSaturate() &&
            "BF8->HF move does not support pred/cond mod/sat");
        // don't support QF imm for now
        assert(inst->getSrc(0)->isSrcRegRegion() && inst->getSrc(0)->asSrcRegRegion()->getRegAccess() == Direct &&
            inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef &&
            "BF8->HF move does not support source modifier");

        // fix regioning <0;1,0> to <1;1,0> for execution sizes higher than 1.
        if (inst->getSrc(0)->asSrcRegRegion()->getRegion()->isScalar() &&
            inst->getExecSize() != g4::SIMD1)
        {
            inst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
            inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
            INST_LIST_ITER newMovIter = i;
            newMovIter--;
            G4_INST* newMovInst = *newMovIter;
            // broadcast copy is a raw UB move of the original scalar
            newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
            newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
            newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionScalar());
        }
        assert(inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) &&
            "BF8->HF only support <1;1,0> regioning");
        if (inst->getDst()->getHorzStride() != 1)
        {
            if (inst->getExecSize() != g4::SIMD1)
            {
                replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
                INST_LIST_ITER newMovIter = i;
                newMovIter++;
                G4_INST* newMovInst = *newMovIter;
                newMovInst->getSrc(0)->asSrcRegRegion()->setRegion(builder.getRegionStride1());
            }
            inst->getDst()->setHorzStride(1);
        }
        if (!builder.isOpndAligned(inst->getDst(), 64))
        {
            replaceDst(i, inst->getDst()->getType(), ThirtyTwo_Word);
        }
        if (!builder.isOpndAligned(inst->getSrc(0), 64))
        {
            inst->setSrc(insertMovBefore(i, 0, inst->getSrc(0)->getType(), bb, ThirtyTwo_Word), 0);
            INST_LIST_ITER newMovIter = i;
            newMovIter--;
            G4_INST* newMovInst = *newMovIter;
            // alignment copy is a raw UB move
            newMovInst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
            newMovInst->getDst()->asDstRegRegion()->setType(Type_UB);
        }

        // emulate the conversion: shl moves the BF8 byte into the high byte
        // of each UW (HF) element
        inst->getSrc(0)->asSrcRegRegion()->setType(Type_UB);
        G4_SrcRegRegion* newSrc0 = inst->getSrc(0)->asSrcRegRegion();

        inst->getDst()->setType(Type_UW);
        auto newDst = inst->getDst();

        auto shlInst = builder.createBinOp(G4_shl,
            inst->getExecSize(), newDst, newSrc0, builder.createImm(8, Type_UW), inst->getOption(), false);
        bb->insertBefore(i, shlInst);
        bb->erase(i);

        return true;
    }
    if (inst->getSrc(0)->getType() == Type_UD)
    {
        // fcvt a:F b:tf32
        // --> mov a:f b:f (tf32 format is valid f)
        G4_Operand* newSrc;
        if (inst->getSrc(0)->isImm())
        {
            float newF = inst->getSrc(0)->asImm()->getFloat();
            newSrc = builder.createImm(newF);
        }
        else
        {
            G4_SrcRegRegion* regSrc = inst->getSrc(0)->asSrcRegRegion();
            regSrc->setType(Type_F);
            newSrc = regSrc;
        }
        auto newDst = inst->getDst();
        auto movInst = builder.createMov(inst->getExecSize(), newDst, newSrc, inst->getOption(), false);
        bb->insertBefore(i, movInst);
        bb->erase(i);
        return true;
    }

    if (inst->getDst()->getType() == Type_UD)
    {
        // fcvt a:tf32 b:f
        // Make sure dst/src0 have the same subreg offset and stride, except for scalar broadcast.
        G4_Operand* src0 = inst->getSrc(0);
        if (src0->isSrcRegRegion() && !src0->asSrcRegRegion()->getRegion()->isScalar())
        {
            G4_SrcRegRegion* regSrc0 = inst->getSrc(0)->asSrcRegRegion();
            G4_DstRegRegion* regDst = inst->getDst();
            uint16_t srcSingleStride;
            // Note that regSrc0 must not be scalar here!
            if (!regSrc0->getRegion()->isSingleStride(inst->getExecSize(), srcSingleStride))
            {
                // set it to an invalid value as it has no single (uniform) stride
                srcSingleStride = 0xFFFF;
            }
            if (srcSingleStride != regDst->getHorzStride() || !hasSameSubregOffset(inst))
            {
                // Need to force GRF-alignment and stride = 1
                if (srcSingleStride != 1 || !regSrc0->checkGRFAlign())
                {
                    // Make sure to do UD copy for src
                    regSrc0->setType(Type_UD);
                    // Insert mov before i
                    replaceSrc(i, 0, Type_UD, bb, ThirtyTwo_Word);
                    // must have the original type (float) for i
                    inst->getSrc(0)->asSrcRegRegion()->setType(Type_F);
                }
                if (regDst->getHorzStride() != 1 || !regDst->checkGRFAlign())
                {
                    replaceDst(i, regDst->getType(), ThirtyTwo_Word);
                }
                return true;
            }
        }
    }

    return false;
}
8756
8757 // on PVC there are new restrictions on using byte/word region due to XBar reduction
fixByteXBarRestriction(INST_LIST_ITER it,G4_BB * bb)8758 void HWConformity::fixByteXBarRestriction(INST_LIST_ITER it, G4_BB* bb)
8759 {
8760 G4_INST* inst = *it;
8761
8762 if (!inst->getDst() || inst->isSend() || inst->isDpas() ||
8763 inst->getExecSize() == g4::SIMD1)
8764 {
8765 // only check non-scalar ALU instructions
8766 return;
8767 }
8768
8769 // due to much stronger restriction on float-pipe operation,
8770 // assume float-op has been fixed in fixUnalignedRegions
8771 if (isFloatOr64b(inst))
8772 {
8773 return;
8774 }
8775
8776 // hardware checks restriction even on null
8777 if (inst->getDst()->isNullReg())
8778 {
8779 auto dst = inst->getDst();
8780 auto dstTy = dst->getType();
8781 auto stride = dst->getHorzStride();
8782
8783 if ((dstTy == Type_W || dstTy == Type_UW) && stride < 2)
8784 dst->setHorzStride(2);
8785 else if (dstTy == Type_B || dstTy == Type_UB)
8786 {
8787 // create a new dst with W/UW type
8788 G4_DstRegRegion* new_null = builder.createNullDst(dstTy == Type_B ? Type_W : Type_UW);
8789 new_null->setHorzStride(2);
8790 inst->setDest(new_null);
8791 }
8792 return;
8793 }
8794
8795 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010493955) &&
8796 inst->opcode() == G4_mov && inst->getSaturate())
8797 {
8798 auto dst = inst->getDst();
8799 auto dstTy = dst->getType();
8800 if ((dstTy == Type_B || dstTy == Type_UB) &&
8801 inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion())
8802 {
8803 auto src = inst->getSrc(0)->asSrcRegRegion();
8804 auto srcTy = src->getType();
8805 if (srcTy == Type_B)
8806 {
8807 insertMovBefore(it, 0, Type_D, bb);
8808 return;
8809 }
8810 if (srcTy == Type_UB)
8811 {
8812 insertMovBefore(it, 0, Type_UD, bb);
8813 return;
8814 }
8815 }
8816 }
8817
8818 auto isDclGRFAligned = [](G4_Declare* dcl)
8819 {
8820 if (!dcl)
8821 {
8822 return false;
8823 }
8824 uint32_t offset = 0;
8825 auto rootDcl = dcl->getRootDeclare(offset);
8826 return rootDcl->getSubRegAlign() >= GRFALIGN && (offset % getGRFSize() == 0);
8827 };
8828
8829 bool needFix = false;
8830 auto dst = inst->getDst();
8831 auto dstTy = dst->getType();
8832 // FIXME: should call isOpndAligned() here, but seems later code processes subRegOff separately..
8833 bool dstAligned = (dst->getRegAccess() == Direct) && isDclGRFAligned(dst->getTopDcl());
8834 auto dstSubRegOff = dst->getSubRegOff();
8835 bool allDirect = (dst->getRegAccess() == Direct);
8836
8837 // Fix for the odd destination subregister for G4_and, G4_or, G4_xor, G4_add, G4_asr, G4_sel, G4_cmp
8838 // Adding mov instruction to change inst dst subregister to even when conditions are met:
8839 // - instruction is at least two sources and dst isn't null
8840 // - dst sub-register is odd and dst stride is at least 1
8841 // - src0 reg region exist and isn't contiguous
8842 // - dst is B/UB, src0 is B/UB or W/UW, src1 is B/UB or W/UW
8843 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010487853) &&
8844 inst->getNumSrc() > 1 &&
8845 inst->getDst() != NULL &&
8846 inst->getDst()->getSubRegOff() % 2 &&
8847 inst->getDst()->getHorzStride() > 1 &&
8848 inst->getSrc(0)->isSrcRegRegion() &&
8849 inst->getSrc(0)->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()) == false &&
8850 IS_BTYPE(inst->getDst()->getType()) &&
8851 (IS_BTYPE(inst->getSrc(0)->getType()) || IS_WTYPE(inst->getSrc(0)->getType())) &&
8852 (IS_BTYPE(inst->getSrc(1)->getType()) || IS_WTYPE(inst->getSrc(1)->getType())))
8853 {
8854 auto newDstTy = inst->getDst()->getType();
8855 // if dst stride is equal 4 and b2b-DS2 rule isn't covered, changing dst type to dword
8856 if (inst->getDst()->getHorzStride() == 4)
8857 {
8858 newDstTy = Type_D;
8859 replaceDst(it, newDstTy);
8860 return;
8861 }
8862 // force a fix when we applied b2b or w2b rule
8863 needFix = true;
8864 }
8865
8866 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010487853) &&
8867 (dstTy == Type_B || dstTy == Type_UB) && (dstSubRegOff % 2) &&
8868 dst->getHorzStride() >= 4 && inst->getExecSize() == g4::SIMD32)
8869 {
8870 assert(canSplitInst(inst, NULL));
8871 evenlySplitInst(it, bb);
8872 return;
8873 }
8874 // check src0-to-dest and src1-to-dest restrictions
8875 for (int i = 0, numSrc = inst->getNumSrc(); !needFix && i < 2 && i < numSrc; ++i)
8876 {
8877 G4_SrcRegRegion* src = inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() ?
8878 inst->getSrc(i)->asSrcRegRegion() : nullptr;
8879 if (!src)
8880 {
8881 continue;
8882 }
8883 // check then fix the restriction on Src
8884 auto srcTy = src->getType();
8885 auto region = src->getRegion();
8886 bool srcDirect = (src->getRegAccess() == Direct);
8887 allDirect &= srcDirect;
8888 // skip VxH indirect case because src operand will be read out one element a time
8889 if (!srcDirect && region->isRegionWH())
8890 {
8891 continue;
8892 }
8893 bool srcAligned = srcDirect && isDclGRFAligned(src->getTopDcl());
8894 auto srcSubRegOff = src->getSubRegOff();
8895 auto numRows = 1;
8896 unsigned ss = (region->width == 1) ? region->vertStride : region->horzStride;
8897 // we need to check the region rule one row at a time under the following situation
8898 if (region->width > 1 && region->width * region->horzStride != region->vertStride)
8899 {
8900 numRows = inst->getExecSize() / region->width;
8901 assert((inst->getExecSize() % region->width) == 0);
8902 }
8903 for (int row = 0; row < numRows; ++row)
8904 {
8905 srcSubRegOff = (srcSubRegOff + row * region->vertStride) % (getGRFSize() / TypeSize(srcTy));
8906 dstSubRegOff = (dstSubRegOff + row * region->width * dst->getHorzStride()) % (getGRFSize() / TypeSize(dstTy));
8907 bool dstSubRegOffDwordAlign = ((dstSubRegOff % (4 / TypeSize(dstTy))) == 0);
8908 if (TypeSize(srcTy) == 2)
8909 {
8910 // w2w and w2b rules
8911 // cannot have the case of w2b packing case, i.e. dest-stride == 1
8912 assert(!(TypeSize(dstTy) == 1 && dst->getHorzStride() == 1));
8913 if ((TypeSize(dstTy) == 2 && dst->getHorzStride() == 1) ||
8914 (TypeSize(dstTy) == 1 && dst->getHorzStride() == 2))
8915 {
8916 if (numRows > 1 && !dstSubRegOffDwordAlign)
8917 {
8918 needFix = true;
8919 }
8920 else if (ss == 2)
8921 {
8922 bool Aligned = srcAligned && dstAligned
8923 && !(i == 1 && TypeSize(dstTy) == 1 && VISA_WA_CHECK(builder.getPWaTable(), Wa_16012383669))
8924 && ((dstSubRegOff % (32 / TypeSize(dstTy))) == (srcSubRegOff / TypeSize(dstTy)));
8925 needFix |= !Aligned;
8926 }
8927 else if (ss > 2)
8928 {
8929 needFix = true;
8930 }
8931 }
8932 }
8933 else if (TypeSize(srcTy) == 1)
8934 {
8935 if (TypeSize(dstTy) == 2 && dst->getHorzStride() == 1) // b2w rule
8936 {
8937 if (numRows > 1 && !dstSubRegOffDwordAlign)
8938 {
8939 needFix = true;
8940 }
8941 else if (ss == 4)
8942 {
8943 bool Aligned = srcAligned && dstAligned
8944 && ((2 * (dstSubRegOff % 16)) == (srcSubRegOff / 2));
8945 needFix |= !Aligned;
8946 }
8947 else if (ss == 8)
8948 {
8949 bool Aligned = srcAligned && dstAligned
8950 && ((2 * (dstSubRegOff % 8)) == (srcSubRegOff / 4));
8951 needFix |= !Aligned;
8952 }
8953 else if (ss > 8)
8954 {
8955 needFix = true;
8956 }
8957 }
8958 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 2) // b2b-DS2 rule
8959 {
8960 if (numRows > 1 && !dstSubRegOffDwordAlign)
8961 {
8962 needFix = true;
8963 }
8964 else if (ss == 4)
8965 {
8966 bool Aligned = srcAligned && dstAligned
8967 && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
8968 // change dstAligned to false, so we need a pack-shift
8969 // in the end of the fix
8970 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
8971 {
8972 dstAligned &= (dstSubRegOff < 32);
8973 Aligned &= (dstSubRegOff < 32);
8974 }
8975 needFix |= !Aligned;
8976 }
8977 else if (ss == 8)
8978 {
8979 bool Aligned = srcAligned && dstAligned
8980 && ((dst->getSubRegOff() % 16) == (srcSubRegOff / 4));
8981 needFix |= !Aligned;
8982 }
8983 else if (ss > 8)
8984 {
8985 needFix = true;
8986 }
8987
8988 }
8989 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 1 && region->width != 2) // b2b-DS1 rule
8990 {
8991 if (numRows > 1 && !dstSubRegOffDwordAlign)
8992 {
8993 needFix = true;
8994 }
8995 else if (ss == 2)
8996 {
8997 bool Aligned = srcAligned && dstAligned
8998 && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
8999 needFix |= !Aligned;
9000 }
9001 else if (ss == 4)
9002 {
9003 bool Aligned = srcAligned && dstAligned
9004 && ((dstSubRegOff % 16) == (srcSubRegOff / 4));
9005 // change dstAligned to false, so we need a pack-shift
9006 // in the end of the fix
9007 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9008 {
9009 dstAligned &= (dstSubRegOff < 32);
9010 Aligned &= (dstSubRegOff < 32);
9011 }
9012 needFix |= !Aligned;
9013 }
9014 else if (ss > 4)
9015 {
9016 needFix = true;
9017 }
9018
9019 }
9020 else if (TypeSize(dstTy) == 1 && dst->getHorzStride() == 1 && region->width == 2) // b2b-DS1 rule
9021 {
9022 if (numRows > 1 && !dstSubRegOffDwordAlign)
9023 {
9024 needFix = true;
9025 }
9026 else if (region->horzStride + region->vertStride >= 4)
9027 {
9028 if (region->horzStride == 2 && region->vertStride == 4)
9029 {
9030 bool Aligned = srcAligned && dstAligned
9031 && ((dstSubRegOff % 32) == (srcSubRegOff / 2));
9032 // change dstAligned to false, so we need a pack-shift
9033 // in the end of the fix
9034 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9035 {
9036 dstAligned &= (dstSubRegOff < 32);
9037 Aligned &= (dstSubRegOff < 32);
9038 }
9039 needFix |= !Aligned;
9040 }
9041 else if (region->horzStride == 4 && region->vertStride == 8)
9042 {
9043 bool Aligned = srcAligned && dstAligned
9044 && ((dstSubRegOff % 16) == (srcSubRegOff / 4));
9045 // change dstAligned to false, so we need a pack-shift
9046 // in the end of the fix
9047 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_1507979211))
9048 {
9049 dstAligned &= (dstSubRegOff < 32);
9050 Aligned &= (dstSubRegOff < 32);
9051 }
9052 needFix |= !Aligned;
9053
9054 }
9055 else
9056 {
9057 needFix = true;
9058 }
9059 }
9060 else if (region->horzStride == 2)
9061 {
9062 // DS==1 && W==2 && HS==2 && VS == 0 or 1
9063 needFix = true;
9064 }
9065 }
9066 }
9067 }
9068 }
9069
9070 if (needFix)
9071 {
9072 if (inst->getExecSize() == g4::SIMD2 && allDirect && inst->getNumSrc() != 3)
9073 {
9074 // just split the inst
9075 evenlySplitInst(it, bb);
9076 return;
9077 }
9078
9079 auto scale = 4 / TypeSize(dstTy);
9080 const RegionDesc* unpackRegion = builder.createRegionDesc(scale, 1, 0);
9081 dstSubRegOff = dst->getSubRegOff() % (getGRFSize() / TypeSize(dstTy));
9082
9083 // compute the sub-reg-offset we need to use
9084 short tmpSSR = 0;
9085 if (TypeSize(dstTy) == 2)
9086 {
9087 tmpSSR = 2 * (dstSubRegOff % 16);
9088 }
9089 else
9090 {
9091 assert(TypeSize(dstTy) == 1);
9092 if (dst->getHorzStride() == 2)
9093 {
9094 tmpSSR = 2 * (dstSubRegOff % 32);
9095 }
9096 else
9097 {
9098 assert(dst->getHorzStride() == 1);
9099 tmpSSR = 4 * (dstSubRegOff % 16);
9100 }
9101 }
9102 auto tempSize = std::max(inst->getExecSize() * scale + tmpSSR, getGRFSize() / TypeSize(dstTy));
9103
9104 // Replace the dest with a temp, same-type, offset == 0
9105 // stride == 2 for word-type; stride == 4 for byte-type
9106 // Add a B2B or W2W pack-move from temp.0(stride; 1, 0) to the original-dest.sub(ds)
9107 // however, if the original-dest is NOT grf-aligned, we need another B2B or W2W
9108 // to shift the location of packed bytes or words after packing.
9109 if (dstAligned && (tempSize <= (unsigned short)(getGRFSize() * 2)))
9110 {
9111 G4_Declare* unpackDcl = builder.createTempVar(tempSize, dstTy, GRFALIGN);
9112
9113 G4_SrcRegRegion* unpackSrc = builder.createSrc(
9114 unpackDcl->getRegVar(),
9115 0,
9116 tmpSSR,
9117 unpackRegion,
9118 unpackDcl->getElemType());
9119
9120 G4_Predicate* pred = NULL;
9121 if (inst->opcode() != G4_sel)
9122 {
9123 pred = inst->getPredicate();
9124 inst->setPredicate(NULL);
9125 // maintainDU4TempMov will update def-use
9126 }
9127 unsigned int new_option = inst->getMaskOption();
9128
9129 auto pos = it;
9130 pos++;
9131
9132 // insert the packing move
9133 G4_INST* packInst = builder.createMov(inst->getExecSize(), dst, unpackSrc, new_option, false);
9134 packInst->setPredicate(pred);
9135 bb->insertBefore(pos, packInst);
9136
9137 // update def-use info
9138 maintainDU4TempMov(inst, packInst);
9139 // change the destination of the original instruction
9140 if (dstTy == Type_UW || dstTy == Type_W || inst->getSaturate() || (tmpSSR % scale))
9141 {
9142 auto tmpDst = builder.createDst(
9143 unpackDcl->getRegVar(),
9144 0,
9145 tmpSSR,
9146 scale,
9147 unpackDcl->getElemType());
9148 inst->setDest(tmpDst);
9149 }
9150 else
9151 {
9152 // use dword destination to avoid read-modify-write
9153 G4_Declare* tmpDstDcl =
9154 builder.createTempVar(tempSize / scale,
9155 (dstTy == Type_UB) ? Type_UD : Type_D, GRFALIGN);
9156 tmpDstDcl->setAliasDeclare(unpackDcl, 0);
9157 auto tmpDst = builder.createDst(
9158 tmpDstDcl->getRegVar(),
9159 0,
9160 tmpSSR / scale,
9161 1,
9162 tmpDstDcl->getElemType());
9163 inst->setDest(tmpDst);
9164 }
9165 }
9166 else
9167 {
9168 G4_Declare* unpackDcl = builder.createTempVar(inst->getExecSize() * scale, dstTy, GRFALIGN);
9169 G4_SrcRegRegion* unpackSrc = builder.createSrcRegRegion(unpackDcl, unpackRegion);
9170 G4_Predicate* pred = NULL;
9171 if (inst->opcode() != G4_sel)
9172 {
9173 pred = inst->getPredicate();
9174 inst->setPredicate(NULL);
9175 // maintainDU4TempMov will update def-use
9176 }
9177 unsigned int new_option = inst->getMaskOption();
9178 auto pos = it;
9179 pos++;
9180 auto dstride = dst->getHorzStride();
9181 const RegionDesc* shiftRegion = builder.createRegionDesc(dstride, 1, 0);
9182 G4_Declare* shiftDcl = builder.createTempVar(inst->getExecSize() * dstride, dstTy, GRFALIGN);
9183 G4_SrcRegRegion* shiftSrc = builder.createSrcRegRegion(shiftDcl, shiftRegion);
9184 auto packTmp = builder.createDstRegRegion(shiftDcl, dstride);
9185 // pack
9186 G4_INST* packInst = builder.createMov(inst->getExecSize(), packTmp, unpackSrc, new_option, false);
9187 packInst->setPredicate(pred);
9188 bb->insertBefore(pos, packInst);
9189 // then shift the bytes and words location
9190 G4_INST* shiftInst = builder.createMov(inst->getExecSize(), dst, shiftSrc, new_option, false);
9191 shiftInst->setPredicate(pred);
9192 bb->insertBefore(pos, shiftInst);
9193 // update propagation info
9194 maintainDU4TempMov(inst, shiftInst);
9195 // change the destination of the original instruction
9196 if (dstTy == Type_UW || dstTy == Type_W || inst->getSaturate())
9197 {
9198 inst->setDest(builder.createDstRegRegion(unpackDcl, scale));
9199 }
9200 else
9201 {
9202 // situations we use dword-tmp to reduce byte-read-mod-write
9203 G4_Declare* tmpDstDcl =
9204 builder.createTempVar(inst->getExecSize(),
9205 (dstTy == Type_UB) ? Type_UD : Type_D, GRFALIGN);
9206 tmpDstDcl->setAliasDeclare(unpackDcl, 0);
9207 inst->setDest(builder.createDstRegRegion(tmpDstDcl, 1));
9208 }
9209 }
9210 }
9211 }
9212
fixSrnd(INST_LIST_ITER it,G4_BB * bb)9213 bool HWConformity::fixSrnd(INST_LIST_ITER it, G4_BB* bb)
9214 {
9215 G4_INST* inst = *it;
9216 if (inst->opcode() != G4_srnd)
9217 {
9218 return false;
9219 }
9220
9221 bool changed = false; // return value
9222
9223 // case 1. src0 cannot be imm.
9224 // case 2. subreg must be zero (must be grf-aligned)
9225 // case 3. For HF->BF8, both dst and src must be packed
9226 // srnd: https://gfxspecs.intel.com/Predator/Home/Index/67451
9227 G4_DstRegRegion* dst = inst->getDst();
9228 uint32_t execsize = inst->getExecSize();
9229 bool Packed = (dst->getType() == Type_UB);
9230 if (!dst->checkGRFAlign() || // case 2
9231 (Packed && dst->getHorzStride() != 1)) // case 3
9232 {
9233 G4_Declare* dcl = builder.createTempVar(execsize, dst->getType(), GRFALIGN);
9234 G4_SrcRegRegion* srcRegion = builder.createSrcRegRegion(
9235 dcl,
9236 execsize == 1 ? builder.getRegionScalar() : builder.getRegionStride1());
9237 uint32_t newOption = InstOpt_WriteEnable | inst->getMaskOption();
9238 G4_INST* newInst = builder.createMov(G4_ExecSize(execsize), dst, srcRegion, newOption, false);
9239 bb->insertAfter(it, newInst);
9240
9241 G4_DstRegRegion* newDst = builder.createDstRegRegion(dcl, 1);
9242 inst->setDest(newDst);
9243 changed = true;
9244 }
9245
9246 G4_Operand* opnd0 = inst->getSrc(0);
9247 if (opnd0->isImm() || // case 1
9248 !opnd0->asSrcRegRegion()->checkGRFAlign() || // case 2
9249 (Packed && !opnd0->asSrcRegRegion()->getRegion()->isContiguous(execsize))) // case 3
9250 {
9251 G4_Operand* newSrc0 = insertMovBefore(it, 0, opnd0->getType(), bb, GRFALIGN);
9252 inst->setSrc(newSrc0, 0);
9253 G4_INST* newMovInst = *(std::prev(it));
9254 newMovInst->setNoMask(true);
9255 changed = true;
9256 }
9257
9258 G4_Operand* opnd1 = inst->getSrc(1);
9259 if (opnd1->isSrcRegRegion() &&
9260 (!opnd1->asSrcRegRegion()->checkGRFAlign() || // case 2
9261 (Packed && !opnd1->asSrcRegRegion()->getRegion()->isContiguous(execsize)))) // case 3
9262 {
9263 G4_Operand* newSrc1 = insertMovBefore(it, 1, opnd1->getType(), bb, GRFALIGN);
9264 inst->setSrc(newSrc1, 1);
9265 G4_INST* newMovInst = *(std::prev(it));
9266 newMovInst->setNoMask(true);
9267 changed = true;
9268 }
9269 return changed;
9270 }
9271
fixShiftInsts(INST_LIST_ITER i,G4_BB * bb)9272 void HWConformity::fixShiftInsts(INST_LIST_ITER i, G4_BB* bb)
9273 {
9274 G4_INST* inst = *i;
9275 if (inst->opcode() != G4_shl && inst->opcode() != G4_shr && inst->opcode() != G4_asr)
9276 {
9277 return;
9278 }
9279
9280 auto dst = inst->getDst();
9281 auto src0 = inst->getSrc(0);
9282 auto src1 = inst->getSrc(1);
9283
9284 bool needWA = false;
9285
9286 if (builder.getPlatform() == GENX_PVCXT && !IS_QTYPE(dst->getType()) && !IS_QTYPE(src0->getType()) && IS_QTYPE(src1->getType()))
9287 {
9288 needWA = true;
9289 }
9290
9291 if (builder.getOption(vISA_forceSrc0ToQwForQwShlWA) && inst->opcode() == G4_shl && IS_QTYPE(dst->getType()) && !IS_QTYPE(src0->getType()))
9292 {
9293 needWA = true;
9294 }
9295
9296 if (needWA)
9297 {
9298 G4_Operand* newSrc0 = insertMovBefore(i, 0, IS_SIGNED_INT(src0->getType()) ? Type_Q : Type_UQ, bb);
9299 inst->setSrc(newSrc0, 0);
9300 }
9301 }
9302
hasDedicateAlignRegionConformity(const G4_INST * I) const9303 bool HWConformity::hasDedicateAlignRegionConformity(const G4_INST *I) const
9304 {
9305 switch (I->opcode())
9306 {
9307 case G4_fcvt:
9308 return true;
9309 case G4_srnd:
9310 return true;
9311 default:
9312 break;
9313 }
9314 return false;
9315 }
9316
// fix src1 region of this inst: copy src1 through a temp when its horizontal stride is too large
fixSrc1Region(INST_LIST_ITER it,G4_BB * bb)9318 void HWConformity::fixSrc1Region(INST_LIST_ITER it, G4_BB* bb)
9319 {
9320 G4_INST* inst = *it;
9321 G4_Operand* src1 = inst->getSrc(1);
9322
9323 // need extra move if horzStride >= 4
9324 if (src1->isSrcRegRegion() && src1->asSrcRegRegion()->getRegion()->horzStride >= 4)
9325 {
9326 G4_Operand* new_src1 = insertMovBefore(it, 1, src1->getType(), bb);
9327 inst->setSrc(new_src1, 1);
9328 }
9329 }
9330
// Legalizes a madw (multiply-add producing a 64-bit result laid out SOA as
// {dst_hi32:d, dst_lo32:d}) instruction:
//   - strips unsupported source modifiers and illegal src1 regions,
//   - forces a GRF-aligned, stride-1, unsaturated destination,
//   - then either annotates the madw with an implicit acc0 dst (expansion
//     deferred to expandMadwPostSchedule) or expands it in place into the
//     mul/mach/addc/add macro sequence.
// Returns an iterator from which the caller should resume scanning (the
// first instruction of the expanded/split sequence, or the next inst when
// expansion is deferred).
INST_LIST_ITER HWConformity::fixMadwInst(INST_LIST_ITER it, G4_BB* bb)
{
    G4_INST* madwInst = *it;
    auto execSize = madwInst->getExecSize();
    MUST_BE_TRUE(madwInst->opcode() == G4_madw, "expect madw instruction");

    MUST_BE_TRUE(builder.getPlatform() >= GENX_PVC || execSize != g4::SIMD32, "SIMD32 is not supported on this platform for madw");

    auto dst = madwInst->getDst();
    MUST_BE_TRUE(IS_DTYPE(dst->getType()), "dst only supports DW type");

    auto src0 = madwInst->getSrc(0);
    auto src1 = madwInst->getSrc(1);
    auto src2 = madwInst->getSrc(2);
    MUST_BE_TRUE(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()) && IS_DTYPE(src2->getType()), "only DW-type sources are supported");

    // src1 does not support modifier
    checkSrcMod(it, bb, 1);

    // fix src1 region: stride can't exceed 4, otherwise the stride of src1 in the expanded mul will be invalid
    fixSrc1Region(it, bb);
    src1 = madwInst->getSrc(1);

    // fix modifier for src0
    if (!builder.supportSrcModforMul())
    {
        checkSrcMod(it, bb, 0);
        src0 = madwInst->getSrc(0);
    }

    // sat cannot be used at all in the macro sequence
    // make the dst GRF-aligned before expanding to macro
    if (madwInst->getSaturate() ||
        dst->getHorzStride() != 1 ||
        isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst) ||
        !builder.isOpndAligned(dst, getGRFSize()))
    {
        // add tmp mov instructions
        // low half of the result occupies dstLowGRFNum GRFs; hi half the same
        int dstLowGRFNum = (int)std::ceil((float)(execSize * dst->getExecTypeSize()) / getGRFSize());
        int dstTotalGRFNum = dstLowGRFNum * 2;

        // GRF-aligned temp that receives both halves of the madw result
        G4_Declare* newDstDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstTotalGRFNum, dst->getType(), GRFALIGN);

        // add a tmp mov for low results in dst
        // (alias view of the low GRFs of newDstDcl; predicate/saturate are
        // carried by the copy-out movs, not by the madw itself)
        G4_Declare* lowMovSrcDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstLowGRFNum, dst->getType(), GRFALIGN);
        lowMovSrcDcl->setAliasDeclare(newDstDcl, 0);
        G4_SrcRegRegion* lowMovSrc = builder.createSrcRegRegion(lowMovSrcDcl, builder.getRegionStride1());
        auto dstLow = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
        G4_INST* lowMovInst = builder.createMov(execSize, dstLow, lowMovSrc, madwInst->getMaskOption(), false);
        lowMovInst->setPredicate(madwInst->getPredicate());
        lowMovInst->setSaturate(madwInst->getSaturate());
        auto insertIter = bb->insertAfter(it, lowMovInst);
        maintainDU4TempMov(madwInst, lowMovInst);

        // add a tmp mov for high results in dst
        // (alias view of the high GRFs of newDstDcl)
        G4_Declare* hiMovSrcDcl = builder.createTempVar(numEltPerGRF(dst->getType()) * dstLowGRFNum, dst->getType(), GRFALIGN);
        hiMovSrcDcl->setAliasDeclare(newDstDcl, dstLowGRFNum * getGRFSize());
        G4_SrcRegRegion* hiMovSrc = builder.createSrcRegRegion(hiMovSrcDcl, builder.getRegionStride1());
        auto dstHi = builder.createDst(dst->getBase(), dst->getRegOff() + dstLowGRFNum, dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
        G4_INST* hiMovInst = builder.createMov(execSize, dstHi, hiMovSrc, madwInst->getMaskOption(), false);
        hiMovInst->setPredicate(madwInst->getPredicate());
        hiMovInst->setSaturate(madwInst->getSaturate());
        bb->insertAfter(insertIter, hiMovInst);
        maintainDU4TempMov(madwInst, hiMovInst);

        // madw now writes the aligned temp, unpredicated and unsaturated
        G4_DstRegRegion* newDst = builder.createDstRegRegion(newDstDcl, 1);
        madwInst->setDest(newDst);
        madwInst->setPredicate(nullptr);
        madwInst->setSaturate(g4::NOSAT);
        dst = newDst;
    }

    INST_LIST_ITER retIter = it;
    if (builder.noMulOrMadwExpandingBeforeScheduler() && builder.getOption(vISA_expandMadwPostSchedule))
    {
        // Here just create tmp variables to fix srcMod, cond modifier, saturate, etc. And Madw->Mul+Mach+Addc+Add expanding
        // will be done in expandMadwPostSchedule pass.

        // need extra mov if dst is acc and src0 is indirect
        if (!builder.accDstforIndirectSrc())
        {
            if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() == IndirGRF)
            {
                madwInst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
            }
        }

        // add implicit acc dst to the madw instruction as acc will be used as dst of the expanded mul after local scheduling.
        // it is a must to fix the WAR/WAW issue of acc in local scheduling.
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, madwInst->getDst()->getType());
        madwInst->setImplAccDst(accDstOpnd);

        retIter = std::next(it);
    }
    else
    {
        // SOA layout of dst:(dst_hi32:d, dst_lo32:d)
        // if src2 is not immediate value of zero, then expand MADW((dst_hi32, dst_lo32) = src0 * src1 + src2) to:
        //     mul (16) acc0.0<1>:d src0<1;1,0>:d src1<2;1,0>:uw
        //     mach (16) dst_hi32<1>:d src0<1;1,0>:d src1<1;1,0>:d
        //     addc (16) dst_lo32<1>:d acc0.0<1;1,0>:d src2<1;1,0>:d  // Low 32 bits
        //     add (16) dst_hi32<1>:d acc0.0<1;1,0>:d dst_hi32<1;1,0>:d  // High 32 bits
        // otherwise, expand to:
        //     mul (16) acc0.0<1>:d src0<1;1,0>:d src1<2;1,0>:uw
        //     mach (16) dst_hi32<1>:d src0<1;1,0>:d src1<1;1,0>:d // High 32 bits
        //     mov (16) dst_lo32<1>:d acc0.0<1;1,0>:d  // Low 32 bits

        uint32_t origOptions = madwInst->getOption();
        G4_Predicate* origPredicate = madwInst->getPredicate();
        // unsigned result only when all three sources are unsigned
        G4_Type tmpType = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType()) && IS_UNSIGNED_INT(src2->getType())) ? Type_UD : Type_D;

        // 1, create a new mul inst
        G4_DstRegRegion* accDstOpnd = builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
        auto newMul = builder.createBinOp(G4_mul, execSize,
            accDstOpnd, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, false);
        auto startIter = bb->insertBefore(it, newMul);
        madwInst->copyDefsTo(newMul, false);
        // change src1 type to uw type
        fixMulSrc1(std::prev(it), bb);

        // 2, create a mach inst
        // high half of the result starts DstHiRegOffset GRFs after the low half
        int DstHiRegOffset = (int)std::ceil((float)(execSize * TypeSize(tmpType)) / getGRFSize());
        G4_DstRegRegion* dstHi32 = builder.createDst(dst->getBase(), dst->getRegOff() + DstHiRegOffset, dst->getSubRegOff(), 1, tmpType);
        G4_INST* machInst = builder.createMach(execSize,
            dstHi32, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions, tmpType);
        machInst->setPredicate(origPredicate);
        // madw is replaced in place by the mach; uses/defs are transferred
        *it = machInst;
        madwInst->transferUse(machInst);
        madwInst->removeAllDefs();
        newMul->addDefUse(machInst, Opnd_implAccSrc);

        auto endIter = it;
        // optimize: only do multiply if src2 is imme 0
        if (src2->isImm() && src2->asImm()->getImm() == 0)
        {
            // 3, create a mov inst
            auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, tmpType);
            auto accSrcOpndMov = builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0,
                execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
            auto movInst = builder.createMov(execSize, dstLo32, accSrcOpndMov, origOptions, false);
            movInst->setPredicate(origPredicate);
            endIter = bb->insertAfter(endIter, movInst);
        }
        else
        {
            // 3, create a addc inst
            auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, tmpType);
            auto accSrcOpnd = builder.createSrc(builder.phyregpool.getAcc0Reg(), 0, 0,
                execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
            auto addcInst = builder.createBinOp(G4_addc, execSize, dstLo32, accSrcOpnd, builder.duplicateOperand(src2), origOptions, false);
            addcInst->setPredicate(origPredicate);
            endIter = bb->insertAfter(endIter, addcInst);

            // 4, create a add inst
            auto src1Add = builder.createSrc(dstHi32->getBase(), dstHi32->getRegOff(), dstHi32->getSubRegOff(),
                execSize == g4::SIMD1 ? builder.getRegionScalar() : builder.getRegionStride1(), tmpType);
            auto addInst = builder.createBinOp(G4_add, execSize, builder.duplicateOperand(dstHi32), builder.duplicateOperand(accSrcOpnd), src1Add, origOptions, false);
            addInst->setPredicate(origPredicate);
            endIter = bb->insertAfter(endIter, addInst);
        }

        // split inst if execSize is larger than native execSize
        if (execSize > builder.getNativeExecSize())
        {
            splitDWMULInst(startIter, endIter, bb);
            retIter = startIter;
        }
        else
        {
            retIter = std::prev(it);
        }
    }

    return retIter;
}
9506
9507 // Currently the local copy propagation phase (newLocalDefHoisting) might be
9508 // too aggressive and could fold a0 register into a select in the float pipe
9509 // which is illegal. We try to fix the instruction in HWConformity because we
9510 // may fix it easily by just flipping the types when it is a raw MOV or a raw
9511 // SEL. This would keep the fp semantics and still save one MOV. Here's an
9512 // example pattern being dealt with.
9513 //
9514 // BEFORE:
9515 // (W&f0.0) sel (1|M0) a0.0<1>:f r5.2<0;1,0>:f r3.3<0;1,0>:f
9516 // =>
9517 // AFTER:
9518 // (W&f0.0) sel (1|M0) a0.0<1>:ud r5.2<0;1,0>:ud r3.3<0;1,0>:ud
9519 //
9520 // For others cases, to keep the fp semantics first we create a temp GRF and
9521 // set it as the new dst of the inst. Then we insert a MOV to the old dst (ARF)
9522 // using the int pipe.
9523 //
9524 // BEFORE:
9525 // (W&f0.0) sel (1|M0) (lt)f0.0 a0.0<1>:f r5.2<0;1,0>:f r3.3<0;1,0>:f
9526 // =>
9527 // AFTER:
9528 // (W&f0.0) sel (1|M0) (lt)f0.0 r2.0<0;1,0>:f r5.2<0;1,0>:f r3.3<0;1,0>:f
9529 // (W&f0.0) mov (1|M0) a0.0<1>:ud r2.0<0;1,0>:ud
fixFloatARFDst(INST_LIST_ITER it,G4_BB * bb)9530 void HWConformity::fixFloatARFDst(INST_LIST_ITER it, G4_BB* bb)
9531 {
9532 auto isDstTargetedARFInFloat = [](G4_DstRegRegion* dst) -> bool {
9533 if (!dst || !dst->getTopDcl())
9534 return false;
9535
9536 // Currently when ARF is used as an index register in dst, vISA treats
9537 // the dst as an ARF dst. Skip the IndirGRF case and return true if the
9538 // dst is ARF/FLAG with a fp type and Direct access. Here's an example
9539 // pattern.
9540 // mov (2) r[A0(0,0), 0]<4>:f V44(0,0)<1;1,0>:f
9541 auto regFile = dst->getTopDcl()->getRegFile();
9542 return (regFile == G4_ADDRESS || regFile == G4_FLAG) &&
9543 IS_TYPE_FLOAT_ALL(dst->getType()) &&
9544 (dst->getRegAccess() == Direct);
9545 };
9546
9547 auto isRawSel = [](G4_INST* inst) -> bool {
9548 return inst->opcode() == G4_sel &&
9549 inst->getDst()->getType() == inst->getSrc(0)->getType() &&
9550 inst->getDst()->getType() == inst->getSrc(1)->getType() &&
9551 inst->getCondMod() == nullptr &&
9552 (inst->getSrc(0)->isSrcRegRegion() &&
9553 inst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef) &&
9554 (inst->getSrc(1)->isImm() ||
9555 (inst->getSrc(1)->isSrcRegRegion() &&
9556 inst->getSrc(1)->asSrcRegRegion()->getModifier() == Mod_src_undef));
9557 };
9558
9559 auto getFlippedIntType = [](G4_Type floatTy) -> G4_Type {
9560 assert(IS_TYPE_FLOAT_ALL(floatTy));
9561 switch (TypeSize(floatTy)) {
9562 case 2:
9563 return Type_UW;
9564 case 4:
9565 return Type_UD;
9566 case 8:
9567 return Type_UQ;
9568 default:
9569 assert(false && "unexpected float type size.");
9570 return Type_UNDEF;
9571 }
9572 };
9573
9574 G4_INST* inst = *it;
9575 G4_DstRegRegion* dst = inst->getDst();
9576 if (!isDstTargetedARFInFloat(dst))
9577 return;
9578
9579 G4_Type floatTy = dst->getType();
9580 G4_Type intTy = getFlippedIntType(floatTy);
9581 if (inst->isRawMov() || isRawSel(inst))
9582 {
9583 // For raw MOV and raw predicate-based SEL (w/o conditional modifier),
9584 // we can just flip the types.
9585 dst->setType(intTy);
9586 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
9587 {
9588 auto src = inst->getSrc(i);
9589 if (src->isSrcRegRegion())
9590 {
9591 src->asSrcRegRegion()->setType(intTy);
9592 }
9593 else if (src->isImm())
9594 {
9595 inst->setSrc(builder.createImm(src->asImm()->getImm(), intTy), i);
9596 }
9597 }
9598 }
9599 else
9600 {
9601 // For others, 2 steps are required.
9602 // 1. Replace the dst with a temp var in float type.
9603 G4_Declare* newDefDcl =
9604 builder.createTempVar(1, floatTy, dst->getTopDcl()->getSubRegAlign());
9605 inst->setDest(builder.createDstRegRegion(newDefDcl, 1));
9606
9607 // 2. Create a MOV that moves the temp var to the old dst (ARF).
9608 G4_Declare* newUseDcl = builder.createTempVar(1, intTy,
9609 dst->getTopDcl()->getSubRegAlign());
9610 newUseDcl->setAliasDeclare(newDefDcl, 0);
9611 const RegionDesc* rd = inst->getExecSize() == 1 ?
9612 builder.getRegionScalar() : builder.getRegionStride1();
9613 G4_SrcRegRegion* newSrcRegion = builder.createSrcRegRegion(newUseDcl, rd);
9614 dst->setType(intTy);
9615 G4_INST* movInst = builder.createMov(inst->getExecSize(), dst, newSrcRegion, inst->getMaskOption(), false);
9616 bb->insertAfter(it, movInst);
9617 }
9618 }
9619