1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 //
10 /// GenXConstants
11 /// -------------
12 ///
13 /// GenXConstants is not in itself a pass. It contains utility functions and a
14 /// class used by other passes for constant loading.
15 ///
16 /// loadNonSimpleConstants
17 /// ^^^^^^^^^^^^^^^^^^^^^^
18 ///
19 /// The GenXPostLegalization pass calls loadNonSimpleConstants to insert a load
20 /// for any operand that is a non-simple constant. (A non-simple constant is one
21 /// that is too big or an invalid value for a constant operand.)
22 ///
23 /// It is called in two places:
24 ///
25 /// 1. in the GenXPostLegalization pass, run after legalization but
26 /// before CSE, so CSE has an opportunity to common up loaded non-simple
27 /// constants;
28 /// 2. later on in GenXCategory, to mop up non-simple constant operands
29 /// created by CSE's constant propagation.
30 ///
31 /// This does not insert a load if the constant is "big simple" (that is, it is
32 /// illegally wide but each legalized part of it is simple) and it is used in
33 /// the "old value" operand of a wrregion, or as a call arg. Inserting a load
34 /// of such a constant here would allow the load to be CSEd, which would be
/// counter-productive as some of the uses would not be kill uses and so
36 /// coalescing would fail there.
37 ///
38 /// Phi incoming constants are not loaded here; they are loaded in
39 /// loadPhiConstants called from GenXCategory. Phi constant loads do not need to
40 /// participate in CSE as loadPhiConstants has its own commoning up tailored for
41 /// phi nodes.
42 ///
43 /// loadConstants
44 /// ^^^^^^^^^^^^^
45 ///
46 /// This is called from GenXCategory. It inserts a load for each constant
47 /// operand that is not allowed to be constant, but remains after
48 /// loadNonSimpleConstants.
49 ///
50 /// Phi incoming constants are not loaded here; they are loaded in
51 /// loadPhiConstants called from GenXCategory.
52 ///
53 /// loadPhiConstants
54 /// ^^^^^^^^^^^^^^^^
55 ///
56 /// This is called from GenXCategory, and it inserts loads for constant phi
57 /// incomings, commoning up when possible and sensible.
58 ///
59 /// Commoning up (inserting one load for multiple phi incomings with the same
60 /// constant, across one or more phi nodes) proceeds as follows:
61 ///
62 /// Firstly, we divide the phi nodes into _webs_, where each web is the maximal
63 /// set of phi nodes that are related through phi nodes and two address
64 /// instructions, so will be coalesced later on in the flow.
65 ///
66 /// Secondly, for a single web, we look for multiple uses of the same constant.
67 /// Such a constant has a load instruction inserted just once, at the end of the
68 /// nearest common dominator of all the corresponding incoming blocks.
69 ///
70 /// If that insert point is in an empty split critical edge block, we instead
71 /// insert in the block above that, in the hope that the split critical edge
72 /// block can be removed later.
73 ///
74 /// ConstantLoader
75 /// ^^^^^^^^^^^^^^
76 ///
77 /// ConstantLoader is a class that represents a constant and information on how
78 /// to load it. This is where analysis happens of whether it is a legal packed
79 /// vector, or whether it needs multiple instructions to load it. It then has
80 /// methods to insert the code to load the constant.
81 ///
82 //===----------------------------------------------------------------------===//
83 #define DEBUG_TYPE "GENX_CONSTANTS"
84
85 #include "GenXConstants.h"
86 #include "GenXGotoJoin.h"
87 #include "GenXIntrinsics.h"
88 #include "GenXUtil.h"
89
90 #include "llvm/ADT/APFloat.h"
91 #include "llvm/ADT/SmallSet.h"
92 #include "llvm/IR/BasicBlock.h"
93 #include "llvm/IR/Constants.h"
94 #include "llvm/IR/DerivedTypes.h"
95 #include "llvm/IR/Dominators.h"
96 #include "llvm/IR/Function.h"
97 #include "llvm/IR/InstIterator.h"
98 #include "llvm/IR/Instructions.h"
99 #include "llvm/IR/Intrinsics.h"
100 #include "llvm/IR/ValueMap.h"
101 #include "llvm/Support/Casting.h"
102 #include "llvm/Support/Debug.h"
103
104 #include "Probe/Assertion.h"
105
106 #include "llvmWrapper/IR/DerivedTypes.h"
107 #include "llvmWrapper/Support/MathExtras.h"
108 #include "llvmWrapper/Support/TypeSize.h"
109
110 using namespace llvm;
111 using namespace genx;
112
113 /***********************************************************************
114 * loadConstantStruct : insert instructions to load a constant struct
115 */
loadConstantStruct(Constant * C,Instruction * InsertPt,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions=nullptr)116 static Value *loadConstantStruct(
117 Constant *C, Instruction *InsertPt, const GenXSubtarget &Subtarget,
118 const DataLayout &DL,
119 SmallVectorImpl<Instruction *> *AddedInstructions = nullptr) {
120 auto ST = cast<StructType>(C->getType());
121 Value *Agg = UndefValue::get(ST);
122 for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
123 Constant *El = C->getAggregateElement(i);
124 if (isa<UndefValue>(El))
125 continue;
126 Value *LoadedEl = nullptr;
127 if (isa<StructType>(El->getType()))
128 LoadedEl =
129 loadConstantStruct(El, InsertPt, Subtarget, DL, AddedInstructions);
130 else {
131 LoadedEl = ConstantLoader(El, Subtarget, DL, nullptr, AddedInstructions)
132 .loadBig(InsertPt);
133 }
134 auto *InsertInst =
135 InsertValueInst::Create(Agg, LoadedEl, i, "loadstruct", InsertPt);
136 Agg = InsertInst;
137 if (AddedInstructions)
138 AddedInstructions->push_back(InsertInst);
139 }
140 return Agg;
141 }
142
143 /***********************************************************************
144 * loadNonSimpleConstants : for any non-simple or illegal size constant in
145 * an instruction, load it.
146 *
147 * Enter: Inst = instruction to find constant operands in
148 * AddedInstructions = 0 else vector to push added instructions onto
149 *
150 * Return: whether code was modified
151 *
152 * This does not load constants in a phi nodes. That is done in
153 * loadPhiConstants.
154 */
loadNonSimpleConstants(Instruction * Inst,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions)155 bool genx::loadNonSimpleConstants(
156 Instruction *Inst, const GenXSubtarget &Subtarget, const DataLayout &DL,
157 SmallVectorImpl<Instruction *> *AddedInstructions) {
158 bool Modified = false;
159 if (isa<PHINode>(Inst))
160 return Modified;
161 // Omit call target operand of a call.
162 unsigned NumArgs = Inst->getNumOperands();
163 auto CI = dyn_cast<CallInst>(Inst);
164 if (CI)
165 NumArgs = CI->getNumArgOperands();
166 unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst);
167 // Do not proceed loading of genx.alloca argument since its value doesn't
168 // needed (only type matters) and always null.
169 if (IID == GenXIntrinsic::genx_alloca)
170 return Modified;
171 for (unsigned i = 0; i != NumArgs; ++i) {
172 if (isa<Constant>(Inst->getOperand(i))) {
173 Use *U = &Inst->getOperandUse(i);
174 Constant *C = dyn_cast<Constant>(*U);
175 if (!C)
176 continue;
177 if (isa<UndefValue>(C))
178 continue;
179 if (opMustBeConstant(Inst, i))
180 continue;
181 if (C->getType()->isStructTy()) {
182 *U = loadConstantStruct(C, Inst, Subtarget, DL, AddedInstructions);
183 Modified = true;
184 continue;
185 }
186
187 ConstantLoader CL(C, Subtarget, DL, Inst, AddedInstructions);
188 if (CL.needFixingSimple()) {
189 Modified = true;
190 CL.fixSimple(i);
191 continue;
192 }
193 if (CL.isSimple())
194 continue;
195 // Do not load a "big simple" constant for the "old value of vector"
196 // input of a wrregion, so it does not get CSEd. CSEing it is
197 // counter-productive because, if it has multiple uses, it will
198 // need to be two-address copied by GenXCoalescing anyway.
199 if (GenXIntrinsic::isWrRegion(IID)
200 && i == GenXIntrinsic::GenXRegion::OldValueOperandNum
201 && CL.isBigSimple())
202 continue;
203 // Similarly, do not load a "big simple" constant for a call arg.
204 if (CI && IID == GenXIntrinsic::not_any_intrinsic && CL.isBigSimple())
205 continue;
206 *U = CL.loadBig(Inst);
207 Modified = true;
208 }
209 }
210 return Modified;
211 }
212
loadConstantsForInlineAsm(CallInst * CI,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions)213 bool genx::loadConstantsForInlineAsm(
214 CallInst *CI, const GenXSubtarget &Subtarget, const DataLayout &DL,
215 SmallVectorImpl<Instruction *> *AddedInstructions) {
216 IGC_ASSERT_MESSAGE(CI->isInlineAsm(), "Inline asm expected");
217 bool Modified = false;
218 auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI);
219 Use *U;
220 for (unsigned i = 0, e = ConstraintsInfo.size(), ArgNo = 0; i != e; ++i) {
221 auto &Info = ConstraintsInfo[i];
222 if (Info.isOutput())
223 continue;
224 U = &CI->getOperandUse(ArgNo);
225 ArgNo++;
226 if (auto C = dyn_cast<Constant>(*U)) {
227 if (!isa<UndefValue>(C)) {
228 switch (Info.getConstraintType()) {
229 default:
230 *U = ConstantLoader(C, Subtarget, DL, nullptr, AddedInstructions)
231 .load(CI);
232 Modified = true;
233 break;
234 case ConstraintType::Constraint_n:
235 case ConstraintType::Constraint_i:
236 case ConstraintType::Constraint_F:
237 break;
238 }
239 }
240 }
241 }
242 return Modified;
243 }
244
/***********************************************************************
 * loadConstants : load constants as required for an instruction
 *
 * Enter:   Inst = instruction whose constant operands are examined
 *
 * Return:  whether code was modified
 *
 * This handles operands that are not allowed to be constant. A constant
 * operand that needs loading because it is a non-simple constant is
 * handled in loadNonSimpleConstants.
 *
 * This does not load constants in phi nodes. That is done in
 * loadPhiConstants.
 */
bool genx::loadConstants(Instruction *Inst, const GenXSubtarget &Subtarget,
                         const DataLayout &DL) {
  bool Modified = false;
  Use *U;
  // Phi incomings are handled in loadPhiConstants.
  if (isa<PHINode>(Inst))
    return Modified;
  if (isa<BinaryOperator>(Inst) &&
      Inst->getType()->getScalarType()->isIntegerTy(1)) {
    // Predicate binary operator: disallow constant operands, except
    // that xor with -1 is allowed.
    for (unsigned oi = 0; oi != 2; ++oi)
      if (auto C = dyn_cast<Constant>(Inst->getOperand(oi))) {
        // True when this is the second operand of an xor and the constant
        // is all-ones (scalar -1 or a splat of -1), i.e. a logical "not".
        auto IsNot = [=]() {
          if (oi != 1)
            return false;
          if (Inst->getOpcode() != Instruction::Xor)
            return false;
          if (!C->getType()->isVectorTy())
            return C->isAllOnesValue();
          Constant *C1 = C->getSplatValue();
          return C1 && C1->isAllOnesValue();
        };
        if (!IsNot()) {
          Inst->setOperand(oi, ConstantLoader(C, Subtarget, DL).load(Inst));
          Modified = true;
        }
      }
  }
  if (isa<SelectInst>(Inst)) {
    // select: disallow constant selector
    U = &Inst->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      *U = ConstantLoader(C, Subtarget, DL).load(Inst);
      Modified = true;
    }
    return Modified;
  }
  if (isa<InsertValueInst>(Inst)) {
    // insertvalue (inserting a value into a struct): disallow constant
    // on element operand.
    U = &Inst->getOperandUse(1);
    if (auto C = dyn_cast<Constant>(*U)) {
      *U = ConstantLoader(C, Subtarget, DL).load(Inst);
      Modified = true;
    }
    // Also disallow constant (other than undef) on old struct value operand.
    // We need to load each non-undef element separately.
    // NOTE(review): Modified is left unset on this path even though code is
    // inserted by loadConstantStruct — confirm whether that is intended.
    U = &Inst->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U))
      if (!isa<UndefValue>(C))
        *U = loadConstantStruct(C, Inst, Subtarget, DL);
    return Modified;
  }
  if (auto Br = dyn_cast<BranchInst>(Inst)) {
    // Conditional branch: disallow constant condition.
    if (Br->isConditional()) {
      if (auto C = dyn_cast<Constant>(Br->getCondition())) {
        Br->setCondition(ConstantLoader(C, Subtarget, DL).load(Br));
        Modified = true;
      }
    }
    return Modified;
  }
  if (auto Ret = dyn_cast<ReturnInst>(Inst)) {
    // Return: disallow constant return value in a subroutine (internal
    // linkage).
    if (Ret->getNumOperands() && Ret->getParent()->getParent()->getLinkage()
        == GlobalValue::InternalLinkage) {
      if (auto C = dyn_cast<Constant>(Ret->getOperand(0))) {
        if (!C->getType()->isVoidTy() && !isa<UndefValue>(C)) {
          Ret->setOperand(0, ConstantLoader(C, Subtarget, DL).load(Ret));
          Modified = true;
        }
      }
    }
    return Modified;
  }
  // Anything past this point only applies to calls.
  auto CI = dyn_cast<CallInst>(Inst);
  if (!CI)
    return Modified;
  if (CI->isInlineAsm())
    return loadConstantsForInlineAsm(CI, Subtarget, DL, nullptr);
  int IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI);
  switch (IntrinsicID) {
  case GenXIntrinsic::not_any_intrinsic:
  case Intrinsic::fma:
  case GenXIntrinsic::genx_ssmad:
  case GenXIntrinsic::genx_sumad:
  case GenXIntrinsic::genx_usmad:
  case GenXIntrinsic::genx_uumad:
  case GenXIntrinsic::genx_output:
  case GenXIntrinsic::genx_output_1:
    // load all args for subroutine and some intrinsic calls.
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      U = &CI->getOperandUse(i);
      if (auto C = dyn_cast<Constant>(*U)) {
        if (!isa<UndefValue>(C)) {
          *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
          Modified = true;
        }
      }
    }
    break;
  case GenXIntrinsic::genx_constanti:
  case GenXIntrinsic::genx_constantf:
    // These exist precisely to carry a constant; nothing to do.
    break;
  case GenXIntrinsic::genx_absi:
  case GenXIntrinsic::genx_absf:
    // abs modifier: disallow constant input.
    U = &CI->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      *U = ConstantLoader(C, Subtarget, DL).load(CI);
      Modified = true;
    }
    break;
  case GenXIntrinsic::genx_rdpredregion:
  case GenXIntrinsic::genx_any:
  case GenXIntrinsic::genx_all:
    // rdpredregion, any, all: disallow constant input
    U = &CI->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      *U = ConstantLoader(C, Subtarget, DL).load(CI);
      Modified = true;
    }
    break;
  case GenXIntrinsic::genx_rdregioni:
  case GenXIntrinsic::genx_rdregionf:
    // rdregion: disallow constant input
    U = &CI->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
      Modified = true;
    }
    // Also disallow constant vector index (constant scalar OK).
    U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (isa<VectorType>(C->getType())) {
        *U = ConstantLoader(C, Subtarget, DL).load(CI);
        Modified = true;
      }
    }
    break;
  case GenXIntrinsic::genx_wrpredpredregion:
    // wrpredpred: disallow constant "old vector" input unless undef
    U = &CI->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (!isa<UndefValue>(C)) {
        *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
        Modified = true;
      }
    }
    break;
  case GenXIntrinsic::genx_wrregioni:
  case GenXIntrinsic::genx_wrregionf:
    // wrregion: disallow constant "old vector" input unless undef
    U = &CI->getOperandUse(0);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (!isa<UndefValue>(C)) {
        *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
        Modified = true;
      }
    }
    // Also disallow constant vector index (constant scalar OK).
    U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (isa<VectorType>(C->getType())) {
        *U = ConstantLoader(C, Subtarget, DL).load(CI);
        Modified = true;
      }
    }
    // Also disallow constant predicate unless all ones.
    U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::PredicateOperandNum);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (!C->isAllOnesValue()) {
        *U = ConstantLoader(C, Subtarget, DL).load(CI);
        Modified = true;
      }
    }
    break;
  case GenXIntrinsic::genx_simdcf_goto:
    // goto: disallow constant predicate input, unless it is all 0. We want to
    // allow constant all 0, as it is the encoding used for an "else", and
    // loading the constant into a predicate register stops the finalizer's
    // structurizer working.
    U = &CI->getOperandUse(2);
    if (auto C = dyn_cast<Constant>(*U)) {
      if (!C->isNullValue()) {
        *U = ConstantLoader(C, Subtarget, DL).load(CI);
        Modified = true;
      }
    }
    break;
  default:
    // Intrinsic: check intrinsic descriptor to see where constant args
    // are allowed.
    // Iterate through each field in the intrinsic info.
    GenXIntrinsicInfo II(IntrinsicID);
    // Intrinsic not found.
    if (II.isNull())
      return Modified;
    unsigned MaxRawOperands = II.getTrailingNullZoneStart(CI);
    for (GenXIntrinsicInfo::iterator i = II.begin(), e = II.end(); i != e; ++i) {
      GenXIntrinsicInfo::ArgInfo AI = *i;
      if (!AI.isArgOrRet() || AI.isRet())
        continue;
      // This field relates to an operand.
      U = &CI->getOperandUse(AI.getArgIdx());
      auto C = dyn_cast<Constant>(*U);
      if (!C)
        continue;
      // Operand is constant.
      // Allow constant if it is i1 or vector of i1 set to all ones; this
      // represents an "all true" predication field.
      if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue())
        continue;
      // Allow constant if intrinsic descriptor allows it for this arg.
      if (!AI.isImmediateDisallowed())
        continue;
      // If it is a RAW operand, allow the constant if it's in the trailing
      // null region (it must be a null constant if so), or if the value
      // is undefined and RAW_NULLALLOWED is enabled.
      if (AI.isRaw()) {
        if ((unsigned)AI.getArgIdx() >= MaxRawOperands) {
          IGC_ASSERT(C->isNullValue());
          continue;
        }
        if (isa<UndefValue>(C) && AI.rawNullAllowed())
          continue;
      }
      // Also allow constant if it is undef in a TWOADDR
      if (isa<UndefValue>(C) && AI.getCategory() == GenXIntrinsicInfo::TWOADDR)
        continue;
      // Also allow constant if it is a reserved surface index.
      if (AI.getCategory() == GenXIntrinsicInfo::SURFACE &&
          visa::isReservedSurfaceIndex(visa::convertToSurfaceIndex(C))) {
        continue;
      }
      // Operand is not allowed to be constant. Insert code to load it.
      *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
      Modified = true;
    }
    break;
  }
  return Modified;
}
500
areConstantsEqual(const Constant * C1,const Constant * C2)501 bool genx::areConstantsEqual(const Constant *C1, const Constant *C2) {
502 IGC_ASSERT(C1 && C2);
503 // If these are same constants then it's obviously true
504 if (C1 == C2)
505 return true;
506
507 Type *C1Ty = C1->getType();
508 Type *C2Ty = C2->getType();
509
510 bool SameType = C1Ty == C2Ty;
511 // If types are not the same then compare if types are bitcastable
512 if (!SameType) {
513 if (!C1Ty->canLosslesslyBitCastTo(C2Ty))
514 return false;
515 }
516
517 // Most common case: check for zero initializers
518 if (C1->isZeroValue() && C2->isZeroValue())
519 return true;
520
521 auto *GC1 = dyn_cast<GlobalValue>(C1);
522 auto *GC2 = dyn_cast<GlobalValue>(C2);
523 // TODO: check for specific versions of each global
524 if (GC1 || GC2)
525 return false;
526
527 if (C1->getValueID() != C2->getValueID())
528 return false;
529
530 // Check contents
531
532 if (const auto *C1Seq = dyn_cast<ConstantDataSequential>(C1)) {
533 const auto *C2Seq = cast<ConstantDataSequential>(C2);
534 StringRef C1RawData = C1Seq->getRawDataValues();
535 StringRef C2RawData = C2Seq->getRawDataValues();
536 if (C1RawData.size() == C2RawData.size())
537 return (C1RawData.compare(C2RawData) == 0);
538 return false;
539 }
540
541 switch (C1->getValueID()) {
542 default:
543 // Otherwise be conservative
544 return false;
545 case Value::ConstantIntVal: {
546 const APInt &C1Int = cast<ConstantInt>(C1)->getValue();
547 const APInt &C2Int = cast<ConstantInt>(C2)->getValue();
548 return C1Int == C2Int;
549 }
550 case Value::ConstantFPVal: {
551 const APFloat &C1FP = cast<ConstantFP>(C1)->getValueAPF();
552 const APFloat &C2FP = cast<ConstantFP>(C2)->getValueAPF();
553 return C1FP.bitcastToAPInt() == C2FP.bitcastToAPInt();
554 }
555 case Value::ConstantVectorVal: {
556 const ConstantVector *C1CV = cast<ConstantVector>(C1);
557 const ConstantVector *C2CV = cast<ConstantVector>(C2);
558 unsigned NumElementsC1 =
559 cast<IGCLLVM::FixedVectorType>(C1Ty)->getNumElements();
560 unsigned NumElementsC2 =
561 cast<IGCLLVM::FixedVectorType>(C2Ty)->getNumElements();
562 if (NumElementsC1 != NumElementsC2)
563 return false;
564 for (uint64_t i = 0; i < NumElementsC1; ++i)
565 if (!areConstantsEqual(cast<Constant>(C1CV->getOperand(i)),
566 cast<Constant>(C2CV->getOperand(i))))
567 return false;
568 return true;
569 }
570 case Value::ConstantArrayVal: {
571 const ConstantArray *C1A = cast<ConstantArray>(C1);
572 const ConstantArray *C2A = cast<ConstantArray>(C2);
573 uint64_t NumElementsC1 = cast<ArrayType>(C1Ty)->getNumElements();
574 uint64_t NumElementsC2 = cast<ArrayType>(C2Ty)->getNumElements();
575 if (NumElementsC1 != NumElementsC2)
576 return false;
577 for (uint64_t i = 0; i < NumElementsC1; ++i)
578 if (!areConstantsEqual(cast<Constant>(C1A->getOperand(i)),
579 cast<Constant>(C2A->getOperand(i))))
580 return false;
581 return true;
582 }
583 }
584 }
585
586 /***********************************************************************
587 * cleanupConstantLoads : remove all genx.constant* intrinsics that have
588 * non-constant source operand
589 */
cleanupConstantLoads(Function * F)590 bool genx::cleanupConstantLoads(Function *F) {
591 bool Modified = false;
592 for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
593 auto *CI = dyn_cast<CallInst>(&*I++);
594 if (!CI)
595 continue;
596 auto IID = GenXIntrinsic::getAnyIntrinsicID(CI);
597 if (IID != GenXIntrinsic::genx_constanti &&
598 IID != GenXIntrinsic::genx_constantf &&
599 IID != GenXIntrinsic::genx_constantpred)
600 continue;
601 if (isa<Constant>(CI->getOperand(0)))
602 continue;
603 CI->replaceAllUsesWith(CI->getOperand(0));
604 CI->eraseFromParent();
605 Modified = true;
606 }
607 return Modified;
608 }
609
/***********************************************************************
 * loadPhiConstants : load constant incomings in phi nodes, commoning up
 *      if appropriate
 *
 * Enter:   F = function to process
 *          DT = dominator tree for F (used to pick the insert point)
 *          ExcludePredicate = when set, skip i1 constants, constants of
 *              256 bits or less, and incomings from branching join label
 *              blocks (used when this runs during pattern matching)
 *
 * Return:  whether code was modified
 */
bool genx::loadPhiConstants(Function &F, DominatorTree *DT,
                            const GenXSubtarget &Subtarget,
                            const DataLayout &DL, bool ExcludePredicate) {
  bool Modified = false;
  // Instructions already claimed by some web; never revisited.
  std::set<Instruction *> Done;
  for (BasicBlock &BB : F) {
    // Phis are at the head of the block; stop at the first non-phi.
    for (auto bi = BB.begin();; ++bi) {
      auto Phi = dyn_cast<PHINode>(&*bi);
      if (!Phi)
        break;
      if (!Done.insert(Phi).second)
        continue; // phi node already processed in some web
      // Gather the web of phi nodes and two address instructions related to
      // this one. This is an approximation to the web of instructions that
      // will or could be coalesced.
      // (Use Web as a worklist of phi nodes and two address instructions to
      // use to find other phi nodes and two address instructions.)
      //
      // We process a web of related phi nodes at a time, rather than all phi
      // nodes that use the constant, to avoid this situation:
      // we try and common up two phi nodes in the same basic block (e.g. two
      // variables both initialized to 0 before a loop), but end up having to
      // insert a copy for one of them anyway in coalescing.
      SmallVector<Instruction *, 4> Web;
      Web.push_back(Phi);
      for (unsigned wi = 0; wi != Web.size(); ++wi) {
        auto Inst = Web[wi];
        // [oi, oe) is the operand range to scan for this worklist entry.
        unsigned oi = 0, oe = 0;
        if ((Phi = dyn_cast<PHINode>(Inst))) {
          // Phi node: process each incoming.
          oe = Phi->getNumIncomingValues();
        } else {
          if (auto *CI = dyn_cast<CallInst>(Inst)) {
            // Two address instruction: process just the two address operand.
            oi = *getTwoAddressOperandNum(CI);
            oe = oi + 1;
          } else {
            // Only no-op casts are added to the web (see below).
            IGC_ASSERT(isa<CastInst>(Inst));
            oi = 0;
            oe = 1;
          }
        }

        // Membership test for the web: phi nodes and two-address calls.
        auto IsPhiOrTwoAddress = [=](Value *V) {
          if (isa<PHINode>(V))
            return true;
          if (auto CI = dyn_cast<CallInst>(V))
            return getTwoAddressOperandNum(CI).hasValue();
          return false;
        };

        // For each incoming:
        for (; oi != oe; ++oi ) {
          auto Incoming = Inst->getOperand(oi);
          // If it is a phi node or two address instruction, push it into the
          // web for processing later.
          if (IsPhiOrTwoAddress(Incoming)) {
            auto IncomingInst = cast<Instruction>(Incoming);
            if (Done.insert(IncomingInst).second)
              Web.push_back(IncomingInst);
          } else if (!isa<Constant>(Incoming)) {
            // For any other inst or arg, see if it has any other use in a phi
            // node or two address inst, and push that into the web.
            for (auto ui = Incoming->use_begin(), ue = Incoming->use_end();
                 ui != ue; ++ui) {
              auto User = cast<Instruction>(ui->getUser());
              // Add bitcasts into the web to process their users too
              if (IsPhiOrTwoAddress(User) ||
                  (isa<CastInst>(User) && cast<CastInst>(User)->isNoopCast(DL)))
                if (Done.insert(User).second)
                  Web.push_back(User);
            }
          }
        }
        // Now process each use of the result of the phi node or two address
        // instruction. If the use is in a phi node or is a two address operand,
        // push the user into the web.
        for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) {
          auto User = cast<Instruction>(ui->getUser());
          if (IsPhiOrTwoAddress(User))
            if (Done.insert(User).second)
              Web.push_back(User);
        }
      }
      LLVM_DEBUG(
        dbgs() << "loadPhiConstants: Web of phi nodes and two address insts:\n";
        for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi)
          dbgs() << **wi << "\n"
      );
      // Now process the web, ignoring anything other than phi nodes.
      // Gather the distinct constants, and every use for each one in a phi
      // node.
      std::map<Constant *, SmallVector<Use *, 4>> ConstantUses;
      SmallVector<Constant *, 8> DistinctConstants;

      // Fill ConstantUses map
      // Process phis with larger types first to make sure that wider
      // constant goes to ConstantUses map first
      auto WebPhisRange = make_filter_range(
          Web, [](Instruction *I) { return isa<PHINode>(I); });
      SmallVector<Instruction *, 4> WebPhis(WebPhisRange);
      std::sort(WebPhis.begin(), WebPhis.end(),
                [&DL](Instruction *I1, Instruction *I2) {
                  return DL.getTypeSizeInBits(I1->getType()) >
                         DL.getTypeSizeInBits(I2->getType());
                });

      for (auto *Inst : WebPhis) {
        auto *Phi = cast<PHINode>(Inst);
        for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) {
          Use *U = &Phi->getOperandUse(oi);
          auto *C = dyn_cast<Constant>(*U);
          if (!C || isa<UndefValue>(C))
            continue;
          // when doing this transform in pattern matching phase
          if (ExcludePredicate) {
            if (C->getType()->getScalarType()->isIntegerTy(1))
              continue;
            if (DL.getTypeSizeInBits(C->getType()) <= 256)
              continue;
            auto IncomingBlock = Phi->getIncomingBlock(oi);
            if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock))
              continue;
          }

          // Merge uses if constants are bitcastable.
          auto EqualC = llvm::find_if(DistinctConstants, [&C](Constant *C2) {
            return genx::areConstantsEqual(C, C2);
          });
          if (EqualC != DistinctConstants.end())
            C = *EqualC;

          auto Entry = &ConstantUses[C];
          if (!Entry->size())
            DistinctConstants.push_back(C);
          Entry->push_back(U);
        }
      }
      // Handle each distinct constant.
      for (unsigned dci = 0, dce = DistinctConstants.size(); dci != dce; ++dci) {
        Constant *C = DistinctConstants[dci];
        auto Entry = &ConstantUses[C];
        if (Entry->size() != 1) {
          LLVM_DEBUG(
            dbgs() << "multiple use of " << *C << "\n";
            for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei)
              dbgs() << *(*Entry)[ei]->getUser() << "\n"
          );
        }
        // Find the closest common dominator of the incoming blocks of all phi
        // uses of the constant. That is where we want to insert the constant
        // load.
        Use *U = (*Entry)[0];
        auto InsertBB = cast<PHINode>(U->getUser())
            ->getIncomingBlock(U->getOperandNo());
        for (unsigned ei = 1, ee = Entry->size(); ei != ee; ++ei) {
          U = (*Entry)[ei];
          auto Phi = cast<PHINode>(U->getUser());
          auto IncomingBB = Phi->getIncomingBlock(U->getOperandNo());
          InsertBB = DT->findNearestCommonDominator(InsertBB, IncomingBB);
        }
        // If that location is an empty split critical edge block, go up to its
        // predecessor (which is also its immediate dominator) if this block is
        // "true" successor of branching simd cf block. In this case we cannot
        // insert anything in current block and have to create partial
        // redundancy.
        IGC_ASSERT(InsertBB);
        auto *InsertTerm = InsertBB->getTerminator();
        auto *SinglePred = InsertBB->getSinglePredecessor();
        if (InsertTerm->getNumSuccessors() == 1 &&
            InsertTerm == &InsertBB->front() && SinglePred &&
            GotoJoin::isBranchingGotoJoinBlock(SinglePred))
          InsertBB = SinglePred;

        // Insert the constant load at the end of the chosen block.
        ConstantLoader CL(C, Subtarget, DL);
        Value *Load = nullptr;
        Instruction *InsertBefore = InsertBB->getTerminator();
        if (!CL.isSimple())
          Load = CL.loadBig(InsertBefore);
        else
          Load = CL.load(InsertBefore);
        Modified = true;
        // Modify the uses.

        // One cached bitcast per distinct type, so commoned-up uses with
        // bitcastable-but-different types share a single cast.
        SmallDenseMap<Type *, Value *, 4> CastMap;
        // Create cast of specific type of given value or reuse it
        // if exists
        auto CreateOrReuseCast = [&CastMap](Value *V, Type *Ty,
                                            Instruction *InsertBefore) {
          // No cast needed
          if (V->getType() == Ty)
            return V;
          // Assume bitcastable for now
          if (!CastMap.count(Ty))
            CastMap[Ty] =
                CastInst::Create(Instruction::BitCast, V, Ty,
                                 V->getName() + ".cast", InsertBefore);
          return CastMap[Ty];
        };

        // Redirect every recorded phi use to the (possibly cast) load.
        for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) {
          auto *U = (*Entry)[ei];
          *U = CreateOrReuseCast(Load, U->get()->getType(), InsertBefore);
        }
        // replace other non-phi uses that are also dominated by the InsertBB
        for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) {
          if (isa<PHINode>(Web[wi]))
            continue;
          auto CI = dyn_cast<CallInst>(Web[wi]);
          if (CI && getTwoAddressOperandNum(CI)) {
            auto oi = *getTwoAddressOperandNum(CI);
            Use *U = &CI->getOperandUse(oi);
            auto *UC = dyn_cast<Constant>(*U);
            if (UC && UC == C) {
              if (CI->getParent() != InsertBB && DT->dominates(InsertBB, CI->getParent()))
                *U = CreateOrReuseCast(Load, U->get()->getType(), InsertBefore);
            }
          }
        }
      }
    }
  }
  return Modified;
}
839
isReplicatedConstantVector(const ConstantVector * Orig,const ConstantVector * ReplicateCandidate)840 bool genx::isReplicatedConstantVector(
841 const ConstantVector *Orig, const ConstantVector *ReplicateCandidate) {
842 IGC_ASSERT(Orig && ReplicateCandidate);
843 // First compare for same element type
844 if (Orig->getType()->getElementType() !=
845 ReplicateCandidate->getType()->getElementType())
846 return false;
847
848 unsigned OrigNumElements = Orig->getType()->getNumElements();
849 unsigned CandidateNumElements =
850 ReplicateCandidate->getType()->getNumElements();
851
852 // Check replicate possibility by size: candidate should be
853 // at least larger and it's size is divisible by the size of
854 // original vector
855 if ((OrigNumElements >= CandidateNumElements) ||
856 ((CandidateNumElements % OrigNumElements) != 0))
857 return false;
858
859 // Get slices
860 unsigned NumReplicates = CandidateNumElements / OrigNumElements;
861 SmallVector<Constant *, 4> Slices;
862 for (unsigned i = 0; i < NumReplicates; i++)
863 Slices.push_back(genx::getConstantSubvector(
864 ReplicateCandidate, i * OrigNumElements, OrigNumElements));
865
866 // Compare all slices
867 return llvm::all_of(Slices,
868 [Orig](Constant *Slice) { return Slice == Orig; });
869 }
870
void ConstantLoader::fixSimple(int OperandIdx) {
  // Swap the simplified constant NewC (prepared during analysis, see
  // analyzeForPackedInt) into operand OperandIdx of User, which must
  // currently hold C, and make NewC the constant this loader tracks.
  IGC_ASSERT_MESSAGE(User, "user must be provided");
  IGC_ASSERT_MESSAGE(NewC, "no need to fix simple case");
  IGC_ASSERT_MESSAGE(User->getOperand(OperandIdx) == C,
    "wrong arguments: wrong operand index was provided");
  User->setOperand(OperandIdx, NewC);
  C = NewC;
  // indicate that we no longer need fix
  NewC = nullptr;
}
881
882 /***********************************************************************
883 * ConstantLoader::loadNonSimple : load a non-simple constant
884 *
885 * Enter: C = constant to lower if necessary
886 * Inst = instruction it is used in (also used to insert new
887 * code before)
888 *
889 * Return: new instruction
890 */
Instruction *ConstantLoader::loadNonSimple(Instruction *Inst) {
  IGC_ASSERT(!isSimple());
  // Illegally sized constants are split up and loaded by loadBig.
  if (!isLegalSize())
    return loadBig(Inst);
  if (PackedFloat) {
    // Packed float case: load the vector in quads of up to 4 floats, then
    // write each quad into the result with a wrregion.
    unsigned NumElts =
        cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements();
    SmallVector<Instruction *, 4> Quads;
    for (unsigned i = 0, e = NumElts; i != e; i += 4) {
      SmallVector<Constant *, 4> Quad;
      for (unsigned j = 0; j != 4 && (i + j) < NumElts; ++j)
        Quad.push_back(C->getAggregateElement(i + j));
      ConstantLoader Packed(ConstantVector::get(Quad), Subtarget, DL);
      Quads.push_back(Packed.load(Inst));
    }
    Value *V = UndefValue::get(C->getType());
    unsigned Offset = 0;
    auto DbgLoc = Inst->getDebugLoc();
    for (auto &Q : Quads) {
      auto *VTy = cast<IGCLLVM::FixedVectorType>(Q->getType());
      Region R(V, &DL);
      R.getSubregion(Offset, VTy->getNumElements());
      V = R.createWrRegion(V, Q, "constant.quad" + Twine(Offset), Inst, DbgLoc);
      Offset += VTy->getNumElements();
    }
    return cast<Instruction>(V);
  }
  if (PackedIntScale) {
    // Packed int case: each element is PackedIntAdjust + PackedIntScale * v
    // for some v in the packed-immediate range, so load the packed values
    // and then multiply/add the splatted scale/adjust back in.
    auto PackTy = C->getType()->getScalarType();
    // limit the constant-type to 32-bit because we do not want 64-bit operation
    if (DL.getTypeSizeInBits(PackTy) > 32)
      PackTy = Type::getInt32Ty(Inst->getContext());
    // Load as a packed int vector with scale and/or adjust.
    SmallVector<Constant *, 32> PackedVals;
    for (unsigned
             i = 0,
             e = cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements();
         i != e; ++i) {
      int64_t Val = 0;
      // Undefined (non-ConstantInt) elements just become 0 in the packed
      // vector.
      if (auto CI = dyn_cast<ConstantInt>(C->getAggregateElement(i))) {
        Val = CI->getSExtValue();
        Val -= PackedIntAdjust;
        Val /= PackedIntScale;
      }
      PackedVals.push_back(ConstantInt::get(PackTy, Val, /*isSigned=*/true));
      IGC_ASSERT(cast<ConstantInt>(PackedVals.back())->getSExtValue() >= -8
          && cast<ConstantInt>(PackedVals.back())->getSExtValue() <= 15);
    }

    ConstantLoader Packed(ConstantVector::get(PackedVals), Subtarget, DL);
    auto *LoadPacked = Packed.loadNonPackedIntConst(Inst);
    if (PackedIntScale != 1) {
      // Multiply the splatted scale back in.
      auto *SplatVal =
          ConstantInt::get(PackTy, PackedIntScale, /*isSigned=*/true);
      auto *CVTy = cast<IGCLLVM::FixedVectorType>(C->getType());
      auto ElemCount = IGCLLVM::getElementCount(CVTy->getNumElements());
      auto *Op1 = ConstantVector::getSplat(ElemCount, SplatVal);
      LoadPacked = BinaryOperator::Create(Instruction::Mul, LoadPacked, Op1,
                                          "constantscale", Inst);
    }
    if (PackedIntAdjust) {
      // Add the splatted adjustment back in.
      auto *SplatVal =
          ConstantInt::get(PackTy, PackedIntAdjust, /*isSigned=*/true);
      auto *CVTy = cast<IGCLLVM::FixedVectorType>(C->getType());
      auto ElemCount = IGCLLVM::getElementCount(CVTy->getNumElements());
      auto *Op1 = ConstantVector::getSplat(ElemCount, SplatVal);
      LoadPacked = BinaryOperator::Create(Instruction::Add, LoadPacked, Op1,
                                          "constantadjust", Inst);
    }
    // If we packed in a narrower type, sign-extend back to the original
    // element type (emulating the sext when the target lacks native i64 ops).
    if (DL.getTypeSizeInBits(PackTy) <
        DL.getTypeSizeInBits(C->getType()->getScalarType())) {
      LoadPacked = CastInst::CreateSExtOrBitCast(LoadPacked, C->getType(),
                                                 "constantsext", Inst);

      bool IsI64 =
          C->getType()->getScalarType() == Type::getInt64Ty(Inst->getContext());
      if (IsI64 && !allowI64Ops()) {
        if (LoadPacked->getOpcode() == Instruction::CastOps::SExt) {
          LoadPacked = genx::emulateI64Operation(&Subtarget, LoadPacked,
                                                 EmulationFlag::RAUWE);
        }
      }
    }
    return LoadPacked;
  }
  if (auto CC = getConsolidatedConstant(C)) {
    // We're loading a vector of byte or short (but not i1). Use int so the
    // instruction does not use so many channels. This may also save it being
    // split by legalization.
    ConstantLoader CCL(CC, Subtarget, DL);
    Instruction *NewInst = nullptr;
    if (CCL.isSimple())
      NewInst = CCL.load(Inst);
    else
      NewInst = CCL.loadNonSimple(Inst);
    NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(),
                               "constant", Inst);
    if (AddedInstructions)
      AddedInstructions->push_back(NewInst);
    return NewInst;
  }
  // General case: gather the elements and a bitmap of which are undef.
  auto *VT = cast<IGCLLVM::FixedVectorType>(C->getType());
  unsigned NumElements = VT->getNumElements();
  SmallVector<Constant *, 32> Elements;
  unsigned UndefBits = 0;
  if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C)) {
    // Gather the elements.
    for (unsigned i = 0; i != NumElements; ++i) {
      Constant *El = CDV->getElementAsConstant(i);
      IGC_ASSERT_MESSAGE(!isa<UndefValue>(El), "CDV element can't be undef");
      Elements.push_back(El);
    }
  } else {
    ConstantVector *CV = cast<ConstantVector>(C);
    // Gather the elements.
    for (unsigned i = 0; i != NumElements; ++i) {
      Constant *El = CV->getOperand(i);
      if (isa<UndefValue>(El))
        UndefBits |= 1 << i;
      Elements.push_back(El);
    }
  }
  // RemainingBits tracks which defined elements still need to be written.
  unsigned RemainingBits = ~UndefBits
      & ((NumElements == 32 ? 0 : 1 << NumElements) - 1);
  if (!RemainingBits) {
    // All elements are undef. This should have been simplified away earlier,
    // but we need to cope with it in case it was not. Just load the first
    // element.
    RemainingBits = 1;
  }
  Instruction *Result = 0;
  // If it is wider than 8 elements, see if we can load any group of 8 as a
  // packed vector.
  if (NumElements > 8) {
    for (unsigned Idx = 0; Idx < NumElements - 4; Idx += 8) {
      unsigned Size = std::min(8U, NumElements - Idx);
      Constant *SubC = getConstantSubvector(C, Idx, Size);
      if (isa<UndefValue>(SubC))
        continue;
      ConstantLoader SubLoader(SubC, Subtarget, DL);
      // Only worthwhile when the subvector is loadable as a packed
      // int/float vector.
      if (SubLoader.PackedIntScale == 0 && !SubLoader.isPackedFloatVector())
        continue;
      Region R(C, &DL);
      R.getSubregion(Idx, Size);
      if (SubLoader.isSimple()) {
        Value *SubV = SubC;
        Result = R.createWrConstRegion(
            Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()),
            SubV, "constant.split" + Twine(Idx), Inst, Inst->getDebugLoc());
      } else {
        Value* SubV = SubLoader.loadNonSimple(Inst);
        Result = R.createWrRegion(
            Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()),
            SubV, "constant.split" + Twine(Idx), Inst, Inst->getDebugLoc());
      }
      if (AddedInstructions)
        AddedInstructions->push_back(Result);
      RemainingBits &= ~(255 << Idx);
    }
    if (!RemainingBits)
      return Result;
  }

  // Build the splat sets, that is, the sets of elements of identical value.
  // Each set is represented as a bitmap of element positions.
  SmallVector<unsigned, 32> SplatSets;
  {
    ValueMap<Constant *, unsigned> SplatSetFinder;
    for (unsigned i = 0; i != NumElements; ++i) {
      Constant *El = Elements[i];
      if (!isa<UndefValue>(El)) {
        std::pair<ValueMap<Constant *, unsigned>::iterator, bool> Created
            = SplatSetFinder.insert(std::pair<Constant *, unsigned>(El,
                  SplatSets.size()));
        if (Created.second) {
          // First time this Constant has been seen.
          SplatSets.push_back(1 << i);
        } else {
          // Add on to existing splat set.
          SplatSets[Created.first->second] |= 1 << i;
        }
      }
    }
  }
  // Remove any splat set with only a single element.
  unsigned NewSize = 0;
  for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) {
    if (countPopulation(SplatSets[i]) >= 2)
      SplatSets[NewSize++] = SplatSets[i];
  }
  SplatSets.resize(NewSize);
  // Determine which elements are suitable for inclusion in a packed vector.
  // FIXME Not implemented yet. For an int vector constant, we need to
  // determine whether the instruction expects the operand to be signed
  // or unsigned.

  // Loop constructing the constant until it is complete.
  do {
    // Find the splat set that will contribute the most elements
    // to the vector, taking into account what elements we can access
    // in a 1D region write. (Initialize BestSplatSetBits so, if no best
    // splat is found, we just do a single element out of RemainingBits.)
    //
    // Note that we are looking for the splat set that sets the most elements,
    // not the one that _usefully_ sets the most elements. For example,
    // Examples/sepia has a constant vector of the form
    // < A, B, C, 0, 0, A, B, C >
    // We have four splat sets {0,5} {1,6} {2,7} {3,4}, each of which
    // has two elements. What we want to do is set one of the A, B or C
    // sets first, rather than the 0s, because region restrictions mean that
    // we can only set such a pair if we do it first. If the loop below were
    // to find the splat set that _usefully_ sets the most elements, all four
    // sets would say "2" and we would arbitrarily pick one of them. But, if
    // we ask each splat set how many elements it sets, even uselessly, then
    // the A, B and C sets say "8" and the 0 set says "2", and we ensure that
    // we do one of the A, B or C sets first.
    // So we end up setting the constant in this order (arbitrarily picking
    // A first):
    // < A, A, A, A, A, A, A, A >
    // < 0, 0 >
    // < B >
    // < B >
    // < C >
    // < C >
    // giving five wrregion instructions rather than six.
    unsigned BestSplatSetBits = 1 << genx::log2(RemainingBits);
    unsigned BestSplatSetUsefulBits = BestSplatSetBits;
    unsigned BestSplatSetCount = 1;
    Constant *BestSplatSetConst = Elements[genx::log2(RemainingBits)];
    for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) {
      unsigned Bits = getRegionBits(SplatSets[i] & RemainingBits,
          SplatSets[i] | RemainingBits | UndefBits, NumElements);
      unsigned Count = countPopulation(Bits);
      // For this splat set, Bits is a bitmap of the vector elements that
      // we can set in this splat set in a legal 1D region (possibly including
      // elements already set and undef elements), and Count is how many
      // elements that still need setting the region will set.
      if (Count > BestSplatSetCount) {
        BestSplatSetBits = Bits;
        BestSplatSetUsefulBits = Bits & SplatSets[i];
        BestSplatSetCount = Count;
        BestSplatSetConst = Elements[genx::log2(SplatSets[i])];
      }
    }
    // Now BestSplatSetBits is a bitmap of the vector elements to include in
    // the best splat. Set up the splatted constant.
    if (!Result) {
      // For the first time round the loop, just splat the whole vector,
      // whatever BestSplatBits says.
      Result = loadConstant(
          ConstantVector::getSplat(IGCLLVM::getElementCount(NumElements),
                                   BestSplatSetConst),
          Inst, Subtarget, DL, AddedInstructions);
      Result->setDebugLoc(Inst->getDebugLoc());
    } else {
      // Not the first time round the loop. Set up the splatted subvector,
      // and write it as a region.
      Region R(BestSplatSetBits,
               DL.getTypeSizeInBits(VT->getElementType()) / genx::ByteBits);
      Constant *NewConst = ConstantVector::getSplat(
          IGCLLVM::getElementCount(R.NumElements), BestSplatSetConst);
      Result = R.createWrConstRegion(Result, NewConst, "constant", Inst,
                                     Inst->getDebugLoc());
      if (AddedInstructions)
        AddedInstructions->push_back(Result);
    }
    RemainingBits &= ~BestSplatSetUsefulBits;
  } while (RemainingBits);
  return Result;
}
1160
1161 /***********************************************************************
1162 * getRegionBits : determine which vector elements we can set with a
1163 * 1D region
1164 *
1165 * Enter: NeededBits = bits for vector elements we need to set
1166 * OptionalBits = bits for vector elements we could set
1167 * VecWidth = number of elements in vector
1168 *
1169 * Return: bits for vector elements to set as a legal 1D region,
1170 * maximizing how many of NeededBits are set
1171 */
unsigned ConstantLoader::getRegionBits(unsigned NeededBits,
    unsigned OptionalBits, unsigned VecWidth) {
  if (!NeededBits)
    return 0;
  // Get the first and last element numbers in NeededBits.
  unsigned FirstNeeded = countTrailingZeros(NeededBits, ZB_Undefined);
  unsigned LastNeeded = 31 - countLeadingZeros((uint32_t)NeededBits, ZB_Undefined);
  // Set the max width to the min size including both those elements
  // rounded up to the next power of two.
  unsigned MaxWidth = LastNeeded - FirstNeeded + 1;
  unsigned LogMaxWidth = genx::log2(MaxWidth);
  if (MaxWidth != 1U << LogMaxWidth) {
    ++LogMaxWidth;
    MaxWidth = 1U << LogMaxWidth;
  }
  // Special case NeededBits only having one element.
  if (LogMaxWidth == 0)
    return NeededBits;
  // Now find the best region.
  unsigned BestBits = 0;
  unsigned BestCount = 0;
  // Try each stride. StrideBitsTable[s] is the element bitmap covered by an
  // unshifted, unclipped region of stride 2^s (every element, every second,
  // every fourth).
  static const unsigned StrideBitsTable[] = { 0xffffffffU, 0x55555555U, 0x11111111U };
  for (unsigned LogStride = 0, Stride = 1;
      LogStride <= 2U && LogStride < LogMaxWidth;
      ++LogStride, Stride <<= 1U) {
    // Try each width (not including 1).
    for (unsigned Width = 1U << (LogMaxWidth - LogStride); Width > 1; Width >>= 1) {
      // A narrower region cannot beat the best found so far.
      if (Width <= BestCount)
        break;
      // Try each start index.
      for (unsigned Idx = 0; Idx + (Width - 1) * Stride < VecWidth; ++Idx) {
        if (Idx + Width > VecWidth)
          break;
        // Calculate which indexes the region will set.
        unsigned Bits = StrideBitsTable[LogStride];
        if (Width != 32)
          Bits &= (1 << Width) - 1;
        Bits <<= Idx;
        // See if it sets any elements that we are not allowed to set.
        if (Bits & ~(NeededBits | OptionalBits))
          continue;
        // See if it sets all of NeededBits.
        if ((Bits & NeededBits) == NeededBits)
          return Bits;
        // See if it is the best one we have seen so far.
        unsigned Count = countPopulation(Bits & NeededBits);
        if (Count > BestCount) {
          BestCount = Count;
          BestBits = Bits;
          // No other start index at this width can set more needed elements.
          if (BestCount == Width)
            break;
        }
      }
    }
  }
  if (!BestCount) {
    // We could not find any region that includes more than one of NeededBits.
    // Just do a single element.
    return 1 << genx::log2(NeededBits);
  }
  return BestBits;
}
1235
loadSplatConstant(Instruction * InsertPos)1236 Instruction *ConstantLoader::loadSplatConstant(Instruction *InsertPos) {
1237 // Skip scalar types, vector type with just one element, or boolean vector.
1238 auto *VTy = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1239 if (!VTy ||
1240 VTy->getNumElements() == 1 ||
1241 VTy->getScalarType()->isIntegerTy(1))
1242 return nullptr;
1243 // Skip non-splat vector.
1244 Constant *C1 = C->getSplatValue();
1245 if (!C1)
1246 return nullptr;
1247 // Create <1 x T> constant and broadcast it through rdregion.
1248 Constant *CV = ConstantVector::get(C1);
1249 // Load that scalar constant first.
1250 ConstantLoader L(CV, Subtarget, DL);
1251 Value *V = L.load(InsertPos);
1252 // Broadcast through rdregion.
1253 Region R(V, &DL);
1254 R.Width = R.NumElements = VTy->getNumElements();
1255 R.Stride = 0;
1256 R.VStride = 0;
1257 R.Offset = 0;
1258 Instruction *NewInst = R.createRdRegion(V, ".constsplat", InsertPos, DebugLoc());
1259 if (AddedInstructions)
1260 AddedInstructions->push_back(NewInst);
1261 return NewInst;
1262 }
1263
1264 /***********************************************************************
1265 * ConstantLoader::load : insert instruction to load a constant
1266 *
1267 * We use llvm.genx.constant, rather than bitcast, because CSE has a habit
1268 * of propagating a constant bitcast back into our operand that is not
1269 * allowed to be constant.
1270 *
1271 * Enter: C = constant to load
1272 * InsertBefore = insert new instruction before here
1273 *
1274 * Return: new instruction
1275 */
load(Instruction * InsertBefore)1276 Instruction *ConstantLoader::load(Instruction *InsertBefore) {
1277 IGC_ASSERT(isSimple());
1278 // Do not splat load on byte data as HW does not support byte imm source.
1279 if (!C->getType()->getScalarType()->isIntegerTy(8))
1280 if (auto NewInst = loadSplatConstant(InsertBefore))
1281 return NewInst;
1282
1283 if (!PackedFloat && !PackedIntScale && !isa<UndefValue>(C)) { // not packed int constant or undef
1284 if (auto CC = getConsolidatedConstant(C)) {
1285 // We're loading a vector of byte or short (but not i1). Use int so the
1286 // instruction does not use so many channels. This may also save it being
1287 // split by legalization.
1288 Instruction *NewInst =
1289 loadConstant(CC, InsertBefore, Subtarget, DL, AddedInstructions);
1290 NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(),
1291 "constant", InsertBefore);
1292 if (AddedInstructions)
1293 AddedInstructions->push_back(NewInst);
1294 return NewInst;
1295 }
1296 }
1297
1298 // Load the constant as normal.
1299 Value *Args[] = { C }; // Args to new llvm.genx.constant
1300 Type *OverloadedTypes[] = { C->getType() };
1301 GenXIntrinsic::ID IntrinsicID = GenXIntrinsic::genx_constanti;
1302 if (C->getType()->isFPOrFPVectorTy())
1303 IntrinsicID = GenXIntrinsic::genx_constantf;
1304 else if (C->getType()->getScalarType()->isIntegerTy(1))
1305 IntrinsicID = GenXIntrinsic::genx_constantpred;
1306 Module *M = InsertBefore->getParent()->getParent()->getParent();
1307 Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, OverloadedTypes);
1308 Instruction *NewInst = CallInst::Create(Decl, Args, "constant", InsertBefore);
1309 if (AddedInstructions)
1310 AddedInstructions->push_back(NewInst);
1311 return NewInst;
1312 }
1313
1314 /***********************************************************************
1315 * ConstantLoader::loadNonPackedIntConst : insert instruction to load a constant
 * that is not packed because it has width > 8.
1317 *
1318 * Enter: C = constant to load
1319 * InsertBefore = insert new instruction before here
1320 *
1321 * Return: new instruction
1322 */
loadNonPackedIntConst(Instruction * InsertBefore)1323 Instruction *ConstantLoader::loadNonPackedIntConst(Instruction *InsertBefore) {
1324 auto *CTy = cast<IGCLLVM::FixedVectorType>(C->getType());
1325 IGC_ASSERT(CTy->isIntOrIntVectorTy());
1326 // Simple vectors are already the correct size <= 8, process common load
1327 if (isSimple())
1328 return load(InsertBefore);
1329
1330 unsigned NumElements = CTy->getNumElements();
1331 Instruction *Result = nullptr;
1332 for (unsigned Idx = 0; Idx != NumElements;) {
1333 unsigned Size =
1334 std::min(PowerOf2Floor(NumElements - Idx), (uint64_t)ImmIntVec::Width);
1335 Constant *SubC = getConstantSubvector(C, Idx, Size);
1336 Value *SubV = SubC;
1337 ConstantLoader SubLoader(SubC, Subtarget, DL);
1338 SubV = SubLoader.load(InsertBefore);
1339
1340 Region R(C, &DL);
1341 R.getSubregion(Idx, Size);
1342 Result = R.createWrRegion(
1343 Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), SubV,
1344 "constant.split" + Twine(Idx), InsertBefore, DebugLoc());
1345 Idx += Size;
1346 }
1347 return Result;
1348 }
1349 /***********************************************************************
1350 * ConstantLoader::loadBig : insert instruction to load a constant that might
1351 * be illegally sized
1352 */
Instruction *ConstantLoader::loadBig(Instruction *InsertBefore) {
  // Legally sized (or undef) constants need no splitting; defer to the
  // ordinary simple/non-simple loaders.
  if (isLegalSize() || isa<UndefValue>(C)) {
    // Does not need legalizing.
    if (!isSimple())
      return loadNonSimple(InsertBefore);
    return load(InsertBefore);
  }
  IGC_ASSERT_MESSAGE(!C->getType()->getScalarType()->isIntegerTy(1),
    "not expecting predicate in here");
  if (Constant *Consolidated = getConsolidatedConstant(C)) {
    // Load as a consolidated constant, then bitcast to the correct type.
    auto Load =
        ConstantLoader(Consolidated, Subtarget, DL, nullptr, AddedInstructions)
            .loadBig(InsertBefore);
    IGC_ASSERT(Load);
    Load = CastInst::Create(Instruction::BitCast, Load, C->getType(),
                            Load->getName() + ".cast", InsertBefore);
    if (AddedInstructions)
      AddedInstructions->push_back(Load);
    return Load;
  }
  // Split into power-of-two subvectors no larger than two GRFs (and at most
  // 32 elements), loading each and assembling the result with wrregions.
  auto VT = cast<IGCLLVM::FixedVectorType>(C->getType());
  const unsigned NumElements = VT->getNumElements();
  const unsigned GRFWidthInBits = Subtarget.getGRFByteSize() * genx::ByteBits;
  const unsigned ElementBits = DL.getTypeSizeInBits(VT->getElementType());
  unsigned MaxSize = 2 * GRFWidthInBits / ElementBits;
  MaxSize = std::min(MaxSize, 32U);
  Instruction *Result = nullptr;
  for (unsigned Idx = 0; Idx != NumElements; ) {
    // Take the largest legal power-of-two chunk that fits the remainder.
    unsigned Size = std::min(1U << genx::log2(NumElements - Idx), MaxSize);
    // Load this subvector constant if necessary, and insert into the overall
    // value with wrregion.
    Constant *SubC = getConstantSubvector(C, Idx, Size);
    Value *SubV = SubC;
    ConstantLoader SubLoader(SubC, Subtarget, DL);
    if (!SubLoader.isSimple())
      SubV = SubLoader.loadNonSimple(InsertBefore);
    Region R(C, &DL);
    R.getSubregion(Idx, Size);
    Result = R.createWrRegion(
        Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), SubV,
        "constant.split" + Twine(Idx), InsertBefore, DebugLoc());
    if (AddedInstructions)
      AddedInstructions->push_back(Result);
    Idx += Size;
  }
  return Result;
}
1401
1402 /***********************************************************************
1403 * ConstantLoader::isLegalSize : detect if a constant is a legal size
1404 */
isLegalSize() const1405 bool ConstantLoader::isLegalSize() const {
1406 auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1407 if (!VT)
1408 return true;
1409 const int NumBits = DL.getTypeSizeInBits(VT);
1410 if (!llvm::isPowerOf2_32(NumBits))
1411 return false;
1412 const int GRFSizeInBits = Subtarget.getGRFByteSize() * genx::ByteBits;
1413 if (NumBits > GRFSizeInBits * 2)
1414 return false; // bigger than 2 GRFs
1415 if (VT->getNumElements() > 32)
1416 return false; // 64 bytes not allowed
1417 return true;
1418 }
1419
1420 /***********************************************************************
1421 * ConstantLoader::isBigSimple : detect if a constant is either simple,
1422 * or would be simple after being split into legal sizes
1423 *
1424 * This does not do a thorough check so it misses some cases of a constant
1425 * that would split into simple constants.
1426 */
isBigSimple() const1427 bool ConstantLoader::isBigSimple() const {
1428 IGC_ASSERT_MESSAGE(!needFixingSimple(),
1429 "simple case shall be fixed first before this call");
1430 if (isa<UndefValue>(C))
1431 return true; // undef is simple
1432 auto VT = dyn_cast<VectorType>(C->getType());
1433 if (!VT)
1434 return true; // scalar always simple
1435 if (C->getSplatValue())
1436 return true; // splat constant always simple
1437 if (DL.getTypeSizeInBits(VT->getElementType()) == 1)
1438 return true; // predicate constant always simple
1439 return false;
1440 }
1441
1442 /***********************************************************************
1443 * ConstantLoader::isSimple : detect if a constant is "simple"
1444 *
1445 * A simple constant is one we know can be a constant operand in an instruction.
1446 */
isSimple() const1447 bool ConstantLoader::isSimple() const {
1448 IGC_ASSERT_MESSAGE(!needFixingSimple(),
1449 "simple case shall be fixed first before this call");
1450 if (isa<UndefValue>(C))
1451 return true; // undef is simple (and generates no vISA code)
1452 if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue())
1453 return true; // all 1s predicate is simple
1454 if(User && User->isBinaryOp())
1455 if (isa<VectorType>(C->getType()))
1456 if (auto splat = C->getSplatValue())
1457 if (splat->isZeroValue())
1458 return true;
1459 if (!isLegalSize())
1460 return false; // Simple constant must be legally sized
1461 if (isBigSimple())
1462 return true; // a big simple constant that is legally sized is simple
1463 if (isPackedIntVector())
1464 return true;
1465 if (isPackedFloatVector())
1466 return true;
1467 return false;
1468 }
1469
allowI64Ops() const1470 bool ConstantLoader::allowI64Ops() const {
1471 if (!Subtarget.hasLongLong())
1472 return false;
1473 return true;
1474 }
1475 /***********************************************************************
1476 * ConstantLoader::isPackedIntVector : check for a packed int vector
1477 * (having already done the analysis in the ConstantLoader constructor)
1478 */
isPackedIntVector() const1479 bool ConstantLoader::isPackedIntVector() const {
1480 // Check for a packed int vector. Either the element type must be i16, or
1481 // the user (instruction using the constant) must be genx.constanti or
1482 // wrregion or wrconstregion. Not allowed if the user is a logic op.
1483 if (cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements() >
1484 ImmIntVec::Width)
1485 return false; // wrong width for packed vector
1486 if (PackedIntScale == 1 && (PackedIntAdjust == 0 || PackedIntAdjust == -8)) {
1487 if (!User)
1488 return true; // user not specified -- assume it is a mov, so wrong element
1489 // size is allowed
1490 if (!C->getType()->getScalarType()->isIntegerTy(16)
1491 && GenXIntrinsic::getGenXIntrinsicID(User) != GenXIntrinsic::genx_constanti
1492 && !GenXIntrinsic::isWrRegion(User))
1493 return false; // wrong element size when it is not a mov
1494 switch (User->getOpcode()) {
1495 case Instruction::And:
1496 case Instruction::Or:
1497 case Instruction::Xor:
1498 return false; // disallow packed vector in logic op
1499 default:
1500 break;
1501 }
1502 return true;
1503 }
1504 return false;
1505 }
1506
1507 /***********************************************************************
1508 * ConstantLoader::isPackedFloatVector : check for a packed float vector
1509 * (having already done the analysis in the ConstantLoader constructor)
1510 */
isPackedFloatVector() const1511 bool ConstantLoader::isPackedFloatVector() const {
1512 auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1513 if (!VT)
1514 return false;
1515 if (VT->getNumElements() > 4)
1516 return false;
1517 return PackedFloat;
1518 }
1519
1520 /***********************************************************************
1521 * ConstantLoader::getConsolidatedConstant : get the consolidated constant
1522 * for the given constant
1523 *
1524 * A "consolidated constant" is one where a vector of byte or short is
1525 * turned into the equivalent (as if by bitcast) vector of int.
1526 */
Constant *ConstantLoader::getConsolidatedConstant(Constant *C) {
  if (isa<UndefValue>(C))
    return nullptr;
  auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
  if (!VT)
    return nullptr;
  const unsigned BytesPerElement =
      DL.getTypeSizeInBits(VT->getElementType()) / genx::ByteBits;
  const unsigned NumElements = VT->getNumElements();
  if (!BytesPerElement)
    return nullptr; // vector of i1
  if (BytesPerElement >= 4)
    return nullptr; // already vector of i32/i64/float/double
  if (NumElements * BytesPerElement & 3)
    return nullptr; // not a multiple of 4 bytes long
  // We're loading a vector of byte or short (but not i1). Use int so the
  // instruction does not use so many channels. This may also save it being
  // split by legalization.
  // Compaction = how many original elements pack into one i32; Mask isolates
  // a single element's bits.
  unsigned Compaction = BytesPerElement == 1 ? 4 : 2;
  unsigned Mask = BytesPerElement == 1 ? 0xff : 0xffff;
  SmallVector<Constant *, 8> Elements;
  Type *I32Ty = Type::getInt32Ty(C->getContext());
  for (unsigned i = 0; i != NumElements; i += Compaction) {
    // Pack Compaction consecutive elements into one i32, lowest element in
    // the lowest bits.
    unsigned Val = 0;
    bool IsUndef = true;
    for (unsigned j = 0; j != Compaction; ++j) {
      unsigned Bits = 0;
      Constant *El = C->getAggregateElement(i + j);
      // We assume that anything that is not ConstantInt is undefined. That
      // can include a constant expression with an undefined value in the
      // middle.
      if (auto CI = dyn_cast<ConstantInt>(El)) {
        Bits = CI->getSExtValue();
        IsUndef = false;
      }
      else if (auto CI = dyn_cast<ConstantFP>(El)) {
        APFloat V = CI->getValueAPF();
        Bits = V.bitcastToAPInt().getZExtValue();
        IsUndef = false;
      }
      Val |= (Bits & Mask) << (j * BytesPerElement * 8);
    }
    // The packed i32 is undef only when every constituent element was undef.
    if (IsUndef)
      Elements.push_back(UndefValue::get(I32Ty));
    else
      Elements.push_back(ConstantInt::get(I32Ty, Val));
  }
  // Construct the constant with i32 element type.
  return ConstantVector::get(Elements);
}
1577
1578 /***********************************************************************
1579 * ConstantLoader::analyze : analyze a constant value
1580 *
1581 * This analyzes whether a constant of no more than the right vector width
1582 * (integer 8 or fp 4) can be loaded as a packed vector, possibly scaled
1583 * and adjusted.
1584 */
analyze()1585 void ConstantLoader::analyze() {
1586 auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1587 if (!VT)
1588 return;
1589 if (C->getSplatValue())
1590 return; // don't analyze if already a splat
1591 unsigned NumElements = VT->getNumElements();
1592 if (VT->getElementType()->isIntegerTy()) {
1593 unsigned MaxSize = 2 * Subtarget.getGRFByteSize(); // element type is boolean
1594 if (!VT->getElementType()->isIntegerTy(1)) {
1595 unsigned ElmSz = VT->getScalarSizeInBits() / genx::ByteBits;
1596 MaxSize = 2 * Subtarget.getGRFByteSize() / ElmSz;
1597 }
1598 if (NumElements <= MaxSize)
1599 analyzeForPackedInt(NumElements);
1600 } else if (NumElements <= 8 && VT->getElementType()->isFloatingPointTy())
1601 analyzeForPackedFloat(NumElements);
1602 }
1603
analyzeForPackedInt(unsigned NumElements)1604 void ConstantLoader::analyzeForPackedInt(unsigned NumElements) {
1605 // Get element values.
1606 int64_t Min = INT64_MAX;
1607 int64_t Max = INT64_MIN;
1608 SmallVector<int64_t, 32> Elements;
1609 Constant *SomeDefinedElement = nullptr;
1610 for (unsigned i = 0; i != NumElements; ++i) {
1611 auto El = C->getAggregateElement(i);
1612 if (isa<UndefValue>(El))
1613 continue;
1614 SomeDefinedElement = El;
1615 int64_t Element = cast<ConstantInt>(El)->getSExtValue();
1616 Elements.push_back(Element);
1617 Min = std::min(Min, Element);
1618 Max = std::max(Max, Element);
1619 }
1620 if (Elements.empty()) {
1621 // Constant is undef.
1622 IGC_ASSERT_MESSAGE(C == UndefValue::get(C->getType()),
1623 "constant consists only of undef elements only if it's undef itself");
1624 return;
1625 }
1626 if (Elements.size() == 1) {
1627 // if we don't have an immediate user - do not create new constant
1628 // (constant materilization expects that NewC is cleared)
1629 if (!User)
1630 return;
1631 // All but one element undef. If num elements equal 8
1632 // then turn it into a splat constant
1633 if (NumElements != ImmIntVec::Width)
1634 return;
1635 NewC = ConstantVector::getSplat(IGCLLVM::getElementCount(NumElements),
1636 SomeDefinedElement);
1637 return;
1638 }
1639 int64_t ResArith;
1640 if (IGCLLVM::SubOverflow(Max, Min, ResArith))
1641 return;
1642 if (ResArith <= ImmIntVec::MaxUInt) {
1643 if (Min >= ImmIntVec::MinUInt && Max <= ImmIntVec::MaxUInt) {
1644 // Values all in the range [MinUInt..MaxUInt]. We can do this with a packed
1645 // unsigned int with no extra scaling or adjustment.
1646 PackedIntScale = 1;
1647 PackedIntAdjust = 0;
1648 PackedIntMax = Max;
1649 return;
1650 }
1651 if (Min >= ImmIntVec::MinSInt && Max <= ImmIntVec::MaxSInt) {
1652 // Values all in the range [MinSInt..MaxSInt]. We can do this with a packed
1653 // unsigned int with no extra scaling or adjustment.
1654 PackedIntScale = 1;
1655 PackedIntAdjust = -8;
1656 PackedIntMax = Max + 8;
1657 return;
1658 }
1659 // Values all in the range [Min..Min+MaxUInt]. We can do this
1660 // with a packed int with an adjustment.
1661 PackedIntScale = 1;
1662 PackedIntAdjust = Min;
1663 PackedIntMax = Max - Min;
1664 return;
1665 }
1666 // Get unique absolute differences, so we can detect if we have a valid
1667 // packed int vector that is then scaled and has a splatted constant
1668 // added/subtracted.
1669 SmallVector<uint64_t, 31> Diffs;
1670 SmallSet<uint64_t, 31> DiffsSet;
1671 for (unsigned i = 0, e = Elements.size() - 1; i != e; ++i) {
1672 int64_t Diff;
1673 if (IGCLLVM::SubOverflow(Elements[i + 1], Elements[i], Diff))
1674 return;
1675 if (!Diff)
1676 continue;
1677 uint64_t AbsDiff = std::abs(Diff);
1678 if (AbsDiff > UINT_MAX)
1679 return;
1680 if (DiffsSet.insert(AbsDiff).second)
1681 Diffs.push_back(AbsDiff);
1682 }
1683 IGC_ASSERT_MESSAGE(!Diffs.empty(), "not expecting splatted constant");
1684 // Calculate the GCD (greatest common divisor) of the diffs
1685 uint64_t GCD = Diffs[0];
1686 if (Diffs.size() > 1) {
1687 for(unsigned i = 1; i < Diffs.size(); i++)
1688 GCD = GreatestCommonDivisor64(GCD, Diffs[i]);
1689 }
1690 // Scale should fit in signed integer
1691 if (GCD > static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
1692 return;
1693 int64_t CurScale = static_cast<int64_t>(GCD);
1694 if (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxUInt), ResArith) &&
1695 (Max - Min) > ResArith)
1696 return; // range of values too big
1697 PackedIntScale = CurScale;
1698 PackedIntMax = ImmIntVec::MaxUInt;
1699 // Special case adjust of 0 or -8 as then we can save doing an adjust at all
1700 // by using unsigned or signed packed vector respectively.
1701 if (!(Min % CurScale)) {
1702 if (Min >= ImmIntVec::MinUInt &&
1703 (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxUInt), ResArith) &&
1704 Max <= ResArith)) {
1705 PackedIntAdjust = ImmIntVec::MinUInt;
1706 return;
1707 }
1708 if ((!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MinSInt), ResArith) &&
1709 Min >= ResArith) &&
1710 (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxSInt), ResArith) &&
1711 Max <= ResArith)) {
1712 PackedIntAdjust = Min;
1713 PackedIntMax = ImmIntVec::MaxSInt;
1714 return;
1715 }
1716 // Special case all pre-scaled values being in [-15,0] as we can do that
1717 // by negating the scale and not needing to adjust.
1718 if ((!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(-ImmIntVec::MaxUInt), ResArith) &&
1719 Min >= ResArith) &&
1720 Max <= -ImmIntVec::MinUInt) {
1721 PackedIntAdjust = ImmIntVec::MinUInt;
1722 PackedIntScale = -PackedIntScale;
1723 return;
1724 }
1725 }
1726 PackedIntAdjust = Min;
1727 }
1728
/***********************************************************************
 * is8bitPackedFloat : check whether a single-precision value is exactly
 *    representable in the restricted 8-bit packed float encoding
 *
 * Representable values are +/-0.0, and values whose biased exponent lies
 * in [124, 131] with only the top 4 fraction bits set — except the
 * combination of exponent 124 with a zero fraction, which is rejected.
 */
static bool is8bitPackedFloat(float Val) {
  union {
    float F;
    unsigned Bits;
  } Pun;

  Pun.F = Val;
  const unsigned BiasedExp = (Pun.Bits >> 23) & 0xFF;
  const unsigned Fraction = Pun.Bits & 0x7FFFFF;
  // +/-0.0 is always representable.
  if (BiasedExp == 0 && Fraction == 0)
    return true;
  // Only 8 exponent values fit in the packed encoding.
  if (BiasedExp < 124 || BiasedExp > 131)
    return false;
  // Only the top 4 fraction bits may be non-zero.
  if ((Fraction & 0x780000) != Fraction)
    return false;
  // The smallest exponent with a zero fraction is not representable.
  return !(BiasedExp == 124 && (Fraction >> 19) == 0);
}
1749
analyzeForPackedFloat(unsigned NumElements)1750 void ConstantLoader::analyzeForPackedFloat(unsigned NumElements) {
1751 if (!Subtarget.hasPackedFloat())
1752 return;
1753
1754 for (unsigned i = 0; i != NumElements; ++i) {
1755 auto Elt = C->getAggregateElement(i);
1756 if (isa<UndefValue>(Elt))
1757 continue;
1758 ConstantFP *CFP = dyn_cast<ConstantFP>(Elt);
1759 // Bail out if any element cannot be analyzed.
1760 if (!CFP)
1761 return;
1762 const APFloat &FP = CFP->getValueAPF();
1763 // Bail out if it's not supported.
1764 // TODO: Only support single precision so far.
1765 if (&FP.getSemantics() != &APFloat::IEEEsingle())
1766 return;
1767 // Bail out if it's not finite.
1768 if (!FP.isFinite())
1769 return;
1770 // Check if it could be represented in 8-bit packed float.
1771 if (!is8bitPackedFloat(FP.convertToFloat()))
1772 return;
1773 }
1774 PackedFloat = true;
1775 }
1776