1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 //
10 /// GenXConstants
11 /// -------------
12 ///
13 /// GenXConstants is not in itself a pass. It contains utility functions and a
14 /// class used by other passes for constant loading.
15 ///
16 /// loadNonSimpleConstants
17 /// ^^^^^^^^^^^^^^^^^^^^^^
18 ///
19 /// The GenXPostLegalization pass calls loadNonSimpleConstants to insert a load
20 /// for any operand that is a non-simple constant. (A non-simple constant is one
21 /// that is too big or an invalid value for a constant operand.)
22 ///
23 /// It is called in two places:
24 ///
25 /// 1. in the GenXPostLegalization pass, run after legalization but
26 ///    before CSE, so CSE has an opportunity to common up loaded non-simple
27 ///    constants;
28 /// 2. later on in GenXCategory, to mop up non-simple constant operands
29 ///    created by CSE's constant propagation.
30 ///
31 /// This does not insert a load if the constant is "big simple" (that is, it is
32 /// illegally wide but each legalized part of it is simple) and it is used in
33 /// the "old value" operand of a wrregion, or as a call arg.  Inserting a load
34 /// of such a constant here would allow the load to be CSEd, which would be
35 /// counter productive as some of the uses would not be kill uses and so
36 /// coalescing would fail there.
37 ///
38 /// Phi incoming constants are not loaded here; they are loaded in
39 /// loadPhiConstants called from GenXCategory. Phi constant loads do not need to
40 /// participate in CSE as loadPhiConstants has its own commoning up tailored for
41 /// phi nodes.
42 ///
43 /// loadConstants
44 /// ^^^^^^^^^^^^^
45 ///
46 /// This is called from GenXCategory.  It inserts a load for each constant
47 /// operand that is not allowed to be constant, but remains after
48 /// loadNonSimpleConstants.
49 ///
50 /// Phi incoming constants are not loaded here; they are loaded in
51 /// loadPhiConstants called from GenXCategory.
52 ///
53 /// loadPhiConstants
54 /// ^^^^^^^^^^^^^^^^
55 ///
56 /// This is called from GenXCategory, and it inserts loads for constant phi
57 /// incomings, commoning up when possible and sensible.
58 ///
59 /// Commoning up (inserting one load for multiple phi incomings with the same
60 /// constant, across one or more phi nodes) proceeds as follows:
61 ///
62 /// Firstly, we divide the phi nodes into _webs_, where each web is the maximal
63 /// set of phi nodes that are related through phi nodes and two address
64 /// instructions, so will be coalesced later on in the flow.
65 ///
66 /// Secondly, for a single web, we look for multiple uses of the same constant.
67 /// Such a constant has a load instruction inserted just once, at the end of the
68 /// nearest common dominator of all the corresponding incoming blocks.
69 ///
70 /// If that insert point is in an empty split critical edge block, we instead
71 /// insert in the block above that, in the hope that the split critical edge
72 /// block can be removed later.
73 ///
74 /// ConstantLoader
75 /// ^^^^^^^^^^^^^^
76 ///
77 /// ConstantLoader is a class that represents a constant and information on how
78 /// to load it. This is where analysis happens of whether it is a legal packed
79 /// vector, or whether it needs multiple instructions to load it. It then has
80 /// methods to insert the code to load the constant.
81 ///
82 //===----------------------------------------------------------------------===//
83 #define DEBUG_TYPE "GENX_CONSTANTS"
84 
85 #include "GenXConstants.h"
86 #include "GenXGotoJoin.h"
87 #include "GenXIntrinsics.h"
88 #include "GenXUtil.h"
89 
90 #include "llvm/ADT/APFloat.h"
91 #include "llvm/ADT/SmallSet.h"
92 #include "llvm/IR/BasicBlock.h"
93 #include "llvm/IR/Constants.h"
94 #include "llvm/IR/DerivedTypes.h"
95 #include "llvm/IR/Dominators.h"
96 #include "llvm/IR/Function.h"
97 #include "llvm/IR/InstIterator.h"
98 #include "llvm/IR/Instructions.h"
99 #include "llvm/IR/Intrinsics.h"
100 #include "llvm/IR/ValueMap.h"
101 #include "llvm/Support/Casting.h"
102 #include "llvm/Support/Debug.h"
103 
104 #include "Probe/Assertion.h"
105 
106 #include "llvmWrapper/IR/DerivedTypes.h"
107 #include "llvmWrapper/Support/MathExtras.h"
108 #include "llvmWrapper/Support/TypeSize.h"
109 
110 using namespace llvm;
111 using namespace genx;
112 
113 /***********************************************************************
114  * loadConstantStruct : insert instructions to load a constant struct
115  */
loadConstantStruct(Constant * C,Instruction * InsertPt,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions=nullptr)116 static Value *loadConstantStruct(
117     Constant *C, Instruction *InsertPt, const GenXSubtarget &Subtarget,
118     const DataLayout &DL,
119     SmallVectorImpl<Instruction *> *AddedInstructions = nullptr) {
120   auto ST = cast<StructType>(C->getType());
121   Value *Agg = UndefValue::get(ST);
122   for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
123     Constant *El = C->getAggregateElement(i);
124     if (isa<UndefValue>(El))
125       continue;
126     Value *LoadedEl = nullptr;
127     if (isa<StructType>(El->getType()))
128       LoadedEl =
129           loadConstantStruct(El, InsertPt, Subtarget, DL, AddedInstructions);
130     else {
131       LoadedEl = ConstantLoader(El, Subtarget, DL, nullptr, AddedInstructions)
132                      .loadBig(InsertPt);
133     }
134     auto *InsertInst =
135         InsertValueInst::Create(Agg, LoadedEl, i, "loadstruct", InsertPt);
136     Agg = InsertInst;
137     if (AddedInstructions)
138       AddedInstructions->push_back(InsertInst);
139   }
140   return Agg;
141 }
142 
143 /***********************************************************************
144  * loadNonSimpleConstants : for any non-simple or illegal size constant in
145  *      an instruction, load it.
146  *
147  * Enter:   Inst = instruction to find constant operands in
148  *          AddedInstructions = 0 else vector to push added instructions onto
149  *
150  * Return:  whether code was modified
151  *
152  * This does not load constants in a phi nodes. That is done in
153  * loadPhiConstants.
154  */
loadNonSimpleConstants(Instruction * Inst,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions)155 bool genx::loadNonSimpleConstants(
156     Instruction *Inst, const GenXSubtarget &Subtarget, const DataLayout &DL,
157     SmallVectorImpl<Instruction *> *AddedInstructions) {
158   bool Modified = false;
159   if (isa<PHINode>(Inst))
160     return Modified;
161   // Omit call target operand of a call.
162   unsigned NumArgs = Inst->getNumOperands();
163   auto CI = dyn_cast<CallInst>(Inst);
164   if (CI)
165     NumArgs = CI->getNumArgOperands();
166   unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst);
167   // Do not proceed loading of genx.alloca argument since its value doesn't
168   // needed (only type matters) and always null.
169   if (IID == GenXIntrinsic::genx_alloca)
170     return Modified;
171   for (unsigned i = 0; i != NumArgs; ++i) {
172     if (isa<Constant>(Inst->getOperand(i))) {
173       Use *U = &Inst->getOperandUse(i);
174       Constant *C = dyn_cast<Constant>(*U);
175       if (!C)
176         continue;
177       if (isa<UndefValue>(C))
178         continue;
179       if (opMustBeConstant(Inst, i))
180         continue;
181       if (C->getType()->isStructTy()) {
182         *U = loadConstantStruct(C, Inst, Subtarget, DL, AddedInstructions);
183         Modified = true;
184         continue;
185       }
186 
187       ConstantLoader CL(C, Subtarget, DL, Inst, AddedInstructions);
188       if (CL.needFixingSimple()) {
189         Modified = true;
190         CL.fixSimple(i);
191         continue;
192       }
193       if (CL.isSimple())
194         continue;
195       // Do not load a "big simple" constant for the "old value of vector"
196       // input of a wrregion, so it does not get CSEd. CSEing it is
197       // counter-productive because, if it has multiple uses, it will
198       // need to be two-address copied by GenXCoalescing anyway.
199       if (GenXIntrinsic::isWrRegion(IID)
200           && i == GenXIntrinsic::GenXRegion::OldValueOperandNum
201           && CL.isBigSimple())
202         continue;
203       // Similarly, do not load a "big simple" constant for a call arg.
204       if (CI && IID == GenXIntrinsic::not_any_intrinsic && CL.isBigSimple())
205         continue;
206       *U = CL.loadBig(Inst);
207       Modified = true;
208     }
209   }
210   return Modified;
211 }
212 
loadConstantsForInlineAsm(CallInst * CI,const GenXSubtarget & Subtarget,const DataLayout & DL,SmallVectorImpl<Instruction * > * AddedInstructions)213 bool genx::loadConstantsForInlineAsm(
214     CallInst *CI, const GenXSubtarget &Subtarget, const DataLayout &DL,
215     SmallVectorImpl<Instruction *> *AddedInstructions) {
216   IGC_ASSERT_MESSAGE(CI->isInlineAsm(), "Inline asm expected");
217   bool Modified = false;
218   auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI);
219   Use *U;
220   for (unsigned i = 0, e = ConstraintsInfo.size(), ArgNo = 0; i != e; ++i) {
221     auto &Info = ConstraintsInfo[i];
222     if (Info.isOutput())
223       continue;
224     U = &CI->getOperandUse(ArgNo);
225     ArgNo++;
226     if (auto C = dyn_cast<Constant>(*U)) {
227       if (!isa<UndefValue>(C)) {
228         switch (Info.getConstraintType()) {
229         default:
230           *U = ConstantLoader(C, Subtarget, DL, nullptr, AddedInstructions)
231                    .load(CI);
232           Modified = true;
233           break;
234         case ConstraintType::Constraint_n:
235         case ConstraintType::Constraint_i:
236         case ConstraintType::Constraint_F:
237           break;
238         }
239       }
240     }
241   }
242   return Modified;
243 }
244 
245 /***********************************************************************
246  * loadConstants : load constants as required for an instruction
247  *
248  * This handles operands that are not allowed to be constant. A constant
249  * operand that needs loading because it is a non-simple constant is
250  * handled in loadNonSimpleConstants.
251  *
252  * This does not load constants in a phi nodes. That is done in
253  * loadPhiConstants.
254  */
loadConstants(Instruction * Inst,const GenXSubtarget & Subtarget,const DataLayout & DL)255 bool genx::loadConstants(Instruction *Inst, const GenXSubtarget &Subtarget,
256                          const DataLayout &DL) {
257   bool Modified = false;
258   Use *U;
259   if (isa<PHINode>(Inst))
260     return Modified;
261   if (isa<BinaryOperator>(Inst) &&
262       Inst->getType()->getScalarType()->isIntegerTy(1)) {
263     // Predicate binary operator: disallow constant operands, except
264     // that xor with -1 is allowed.
265     for (unsigned oi = 0; oi != 2; ++oi)
266       if (auto C = dyn_cast<Constant>(Inst->getOperand(oi))) {
267         auto IsNot = [=]() {
268           if (oi != 1)
269             return false;
270           if (Inst->getOpcode() != Instruction::Xor)
271             return false;
272           if (!C->getType()->isVectorTy())
273             return C->isAllOnesValue();
274           Constant *C1 = C->getSplatValue();
275           return C1 && C1->isAllOnesValue();
276         };
277         if (!IsNot()) {
278           Inst->setOperand(oi, ConstantLoader(C, Subtarget, DL).load(Inst));
279           Modified = true;
280         }
281       }
282   }
283   if (isa<SelectInst>(Inst)) {
284     // select: disallow constant selector
285     U = &Inst->getOperandUse(0);
286     if (auto C = dyn_cast<Constant>(*U)) {
287       *U = ConstantLoader(C, Subtarget, DL).load(Inst);
288       Modified = true;
289     }
290     return Modified;
291   }
292   if (isa<InsertValueInst>(Inst)) {
293     // insertvalue (inserting a value into a struct): disallow constant
294     // on element operand.
295     U = &Inst->getOperandUse(1);
296     if (auto C = dyn_cast<Constant>(*U)) {
297       *U = ConstantLoader(C, Subtarget, DL).load(Inst);
298       Modified = true;
299     }
300     // Also disallow constant (other than undef) on old struct value operand.
301     // We need to load each non-undef element separately.
302     U = &Inst->getOperandUse(0);
303     if (auto C = dyn_cast<Constant>(*U))
304       if (!isa<UndefValue>(C))
305         *U = loadConstantStruct(C, Inst, Subtarget, DL);
306     return Modified;
307   }
308   if (auto Br = dyn_cast<BranchInst>(Inst)) {
309     // Conditional branch: disallow constant condition.
310     if (Br->isConditional()) {
311       if (auto C = dyn_cast<Constant>(Br->getCondition())) {
312         Br->setCondition(ConstantLoader(C, Subtarget, DL).load(Br));
313         Modified = true;
314       }
315     }
316     return Modified;
317   }
318   if (auto Ret = dyn_cast<ReturnInst>(Inst)) {
319     // Return: disallow constant return value in a subroutine (internal
320     // linkage).
321     if (Ret->getNumOperands() && Ret->getParent()->getParent()->getLinkage()
322           == GlobalValue::InternalLinkage) {
323       if (auto C = dyn_cast<Constant>(Ret->getOperand(0))) {
324         if (!C->getType()->isVoidTy() && !isa<UndefValue>(C)) {
325           Ret->setOperand(0, ConstantLoader(C, Subtarget, DL).load(Ret));
326           Modified = true;
327         }
328       }
329     }
330     return Modified;
331   }
332   auto CI = dyn_cast<CallInst>(Inst);
333   if (!CI)
334     return Modified;
335   if (CI->isInlineAsm())
336     return loadConstantsForInlineAsm(CI, Subtarget, DL, nullptr);
337   int IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI);
338   switch (IntrinsicID) {
339     case GenXIntrinsic::not_any_intrinsic:
340     case Intrinsic::fma:
341     case GenXIntrinsic::genx_ssmad:
342     case GenXIntrinsic::genx_sumad:
343     case GenXIntrinsic::genx_usmad:
344     case GenXIntrinsic::genx_uumad:
345     case GenXIntrinsic::genx_output:
346     case GenXIntrinsic::genx_output_1:
347       // load all args for subroutine and some intrinsic calls.
348       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
349         U = &CI->getOperandUse(i);
350         if (auto C = dyn_cast<Constant>(*U)) {
351           if (!isa<UndefValue>(C)) {
352             *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
353             Modified = true;
354           }
355         }
356       }
357       break;
358     case GenXIntrinsic::genx_constanti:
359     case GenXIntrinsic::genx_constantf:
360       break;
361     case GenXIntrinsic::genx_absi:
362     case GenXIntrinsic::genx_absf:
363       // abs modifier: disallow constant input.
364       U = &CI->getOperandUse(0);
365       if (auto C = dyn_cast<Constant>(*U)) {
366         *U = ConstantLoader(C, Subtarget, DL).load(CI);
367         Modified = true;
368       }
369       break;
370     case GenXIntrinsic::genx_rdpredregion:
371     case GenXIntrinsic::genx_any:
372     case GenXIntrinsic::genx_all:
373       // rdpredregion, any, all: disallow constant input
374       U = &CI->getOperandUse(0);
375       if (auto C = dyn_cast<Constant>(*U)) {
376         *U = ConstantLoader(C, Subtarget, DL).load(CI);
377         Modified = true;
378       }
379       break;
380     case GenXIntrinsic::genx_rdregioni:
381     case GenXIntrinsic::genx_rdregionf:
382       // rdregion: disallow constant input
383       U = &CI->getOperandUse(0);
384       if (auto C = dyn_cast<Constant>(*U)) {
385         *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
386         Modified = true;
387       }
388       // Also disallow constant vector index (constant scalar OK).
389       U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum);
390       if (auto C = dyn_cast<Constant>(*U)) {
391         if (isa<VectorType>(C->getType())) {
392           *U = ConstantLoader(C, Subtarget, DL).load(CI);
393           Modified = true;
394         }
395       }
396       break;
397     case GenXIntrinsic::genx_wrpredpredregion:
398       // wrpredpred: disallow constant "old vector" input unless undef
399       U = &CI->getOperandUse(0);
400       if (auto C = dyn_cast<Constant>(*U)) {
401         if (!isa<UndefValue>(C)) {
402           *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
403           Modified = true;
404         }
405       }
406       break;
407     case GenXIntrinsic::genx_wrregioni:
408     case GenXIntrinsic::genx_wrregionf:
409       // wrregion: disallow constant "old vector" input unless undef
410       U = &CI->getOperandUse(0);
411       if (auto C = dyn_cast<Constant>(*U)) {
412         if (!isa<UndefValue>(C)) {
413           *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
414           Modified = true;
415         }
416       }
417       // Also disallow constant vector index (constant scalar OK).
418       U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum);
419       if (auto C = dyn_cast<Constant>(*U)) {
420         if (isa<VectorType>(C->getType())) {
421           *U = ConstantLoader(C, Subtarget, DL).load(CI);
422           Modified = true;
423         }
424       }
425       // Also disallow constant predicate unless all ones.
426       U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::PredicateOperandNum);
427       if (auto C = dyn_cast<Constant>(*U)) {
428         if (!C->isAllOnesValue()) {
429           *U = ConstantLoader(C, Subtarget, DL).load(CI);
430           Modified = true;
431         }
432       }
433       break;
434     case GenXIntrinsic::genx_simdcf_goto:
435       // goto: disallow constant predicate input, unless it is all 0. We want to
436       // allow constant all 0, as it is the encoding used for an "else", and
437       // loading the constant into a predicate register stops the finalizer's
438       // structurizer working.
439       U = &CI->getOperandUse(2);
440       if (auto C = dyn_cast<Constant>(*U)) {
441         if (!C->isNullValue()) {
442           *U = ConstantLoader(C, Subtarget, DL).load(CI);
443           Modified = true;
444         }
445       }
446       break;
447     default:
448       // Intrinsic: check intrinsic descriptor to see where constant args
449       // are allowed.
450       // Iterate through each field in the intrinsic info.
451       GenXIntrinsicInfo II(IntrinsicID);
452       // Intrinsic not found.
453       if (II.isNull())
454         return Modified;
455       unsigned MaxRawOperands = II.getTrailingNullZoneStart(CI);
456       for (GenXIntrinsicInfo::iterator i = II.begin(), e = II.end(); i != e; ++i) {
457         GenXIntrinsicInfo::ArgInfo AI = *i;
458         if (!AI.isArgOrRet() || AI.isRet())
459           continue;
460         // This field relates to an operand.
461         U = &CI->getOperandUse(AI.getArgIdx());
462         auto C = dyn_cast<Constant>(*U);
463         if (!C)
464           continue;
465         // Operand is constant.
466         // Allow constant if it is i1 or vector of i1 set to all ones; this
467         // represents an "all true" predication field.
468         if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue())
469           continue;
470         // Allow constant if intrinsic descriptor allows it for this arg.
471         if (!AI.isImmediateDisallowed())
472           continue;
473         // If it is a RAW operand, allow the constant if it's in the trailing
474         // null region (it must be a null constant if so), or if the value
475         // is undefined and RAW_NULLALLOWED is enabled.
476         if (AI.isRaw()) {
477           if ((unsigned)AI.getArgIdx() >= MaxRawOperands) {
478             IGC_ASSERT(C->isNullValue());
479             continue;
480           }
481           if (isa<UndefValue>(C) && AI.rawNullAllowed())
482             continue;
483         }
484         // Also allow constant if it is undef in a TWOADDR
485         if (isa<UndefValue>(C) && AI.getCategory() == GenXIntrinsicInfo::TWOADDR)
486           continue;
487         // Also allow constant if it is a reserved surface index.
488         if (AI.getCategory() == GenXIntrinsicInfo::SURFACE &&
489             visa::isReservedSurfaceIndex(visa::convertToSurfaceIndex(C))) {
490           continue;
491         }
492         // Operand is not allowed to be constant. Insert code to load it.
493         *U = ConstantLoader(C, Subtarget, DL).loadBig(CI);
494         Modified = true;
495       }
496       break;
497   }
498   return Modified;
499 }
500 
areConstantsEqual(const Constant * C1,const Constant * C2)501 bool genx::areConstantsEqual(const Constant *C1, const Constant *C2) {
502   IGC_ASSERT(C1 && C2);
503   // If these are same constants then it's obviously true
504   if (C1 == C2)
505     return true;
506 
507   Type *C1Ty = C1->getType();
508   Type *C2Ty = C2->getType();
509 
510   bool SameType = C1Ty == C2Ty;
511   // If types are not the same then compare if types are bitcastable
512   if (!SameType) {
513     if (!C1Ty->canLosslesslyBitCastTo(C2Ty))
514       return false;
515   }
516 
517   // Most common case: check for zero initializers
518   if (C1->isZeroValue() && C2->isZeroValue())
519     return true;
520 
521   auto *GC1 = dyn_cast<GlobalValue>(C1);
522   auto *GC2 = dyn_cast<GlobalValue>(C2);
523   // TODO: check for specific versions of each global
524   if (GC1 || GC2)
525     return false;
526 
527   if (C1->getValueID() != C2->getValueID())
528     return false;
529 
530   // Check contents
531 
532   if (const auto *C1Seq = dyn_cast<ConstantDataSequential>(C1)) {
533     const auto *C2Seq = cast<ConstantDataSequential>(C2);
534     StringRef C1RawData = C1Seq->getRawDataValues();
535     StringRef C2RawData = C2Seq->getRawDataValues();
536     if (C1RawData.size() == C2RawData.size())
537       return (C1RawData.compare(C2RawData) == 0);
538     return false;
539   }
540 
541   switch (C1->getValueID()) {
542   default:
543     // Otherwise be conservative
544     return false;
545   case Value::ConstantIntVal: {
546     const APInt &C1Int = cast<ConstantInt>(C1)->getValue();
547     const APInt &C2Int = cast<ConstantInt>(C2)->getValue();
548     return C1Int == C2Int;
549   }
550   case Value::ConstantFPVal: {
551     const APFloat &C1FP = cast<ConstantFP>(C1)->getValueAPF();
552     const APFloat &C2FP = cast<ConstantFP>(C2)->getValueAPF();
553     return C1FP.bitcastToAPInt() == C2FP.bitcastToAPInt();
554   }
555   case Value::ConstantVectorVal: {
556     const ConstantVector *C1CV = cast<ConstantVector>(C1);
557     const ConstantVector *C2CV = cast<ConstantVector>(C2);
558     unsigned NumElementsC1 =
559         cast<IGCLLVM::FixedVectorType>(C1Ty)->getNumElements();
560     unsigned NumElementsC2 =
561         cast<IGCLLVM::FixedVectorType>(C2Ty)->getNumElements();
562     if (NumElementsC1 != NumElementsC2)
563       return false;
564     for (uint64_t i = 0; i < NumElementsC1; ++i)
565       if (!areConstantsEqual(cast<Constant>(C1CV->getOperand(i)),
566                              cast<Constant>(C2CV->getOperand(i))))
567         return false;
568     return true;
569   }
570   case Value::ConstantArrayVal: {
571     const ConstantArray *C1A = cast<ConstantArray>(C1);
572     const ConstantArray *C2A = cast<ConstantArray>(C2);
573     uint64_t NumElementsC1 = cast<ArrayType>(C1Ty)->getNumElements();
574     uint64_t NumElementsC2 = cast<ArrayType>(C2Ty)->getNumElements();
575     if (NumElementsC1 != NumElementsC2)
576       return false;
577     for (uint64_t i = 0; i < NumElementsC1; ++i)
578       if (!areConstantsEqual(cast<Constant>(C1A->getOperand(i)),
579                              cast<Constant>(C2A->getOperand(i))))
580         return false;
581     return true;
582   }
583   }
584 }
585 
586 /***********************************************************************
587  * cleanupConstantLoads : remove all genx.constant* intrinsics that have
588  * non-constant source operand
589  */
cleanupConstantLoads(Function * F)590 bool genx::cleanupConstantLoads(Function *F) {
591   bool Modified = false;
592   for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
593     auto *CI = dyn_cast<CallInst>(&*I++);
594     if (!CI)
595       continue;
596     auto IID = GenXIntrinsic::getAnyIntrinsicID(CI);
597     if (IID != GenXIntrinsic::genx_constanti &&
598         IID != GenXIntrinsic::genx_constantf &&
599         IID != GenXIntrinsic::genx_constantpred)
600       continue;
601     if (isa<Constant>(CI->getOperand(0)))
602       continue;
603     CI->replaceAllUsesWith(CI->getOperand(0));
604     CI->eraseFromParent();
605     Modified = true;
606   }
607   return Modified;
608 }
609 
610 /***********************************************************************
611  * loadPhiConstants : load constant incomings in phi nodes, commoning up
612  *      if appropriate
613  */
loadPhiConstants(Function & F,DominatorTree * DT,const GenXSubtarget & Subtarget,const DataLayout & DL,bool ExcludePredicate)614 bool genx::loadPhiConstants(Function &F, DominatorTree *DT,
615                             const GenXSubtarget &Subtarget,
616                             const DataLayout &DL, bool ExcludePredicate) {
617   bool Modified = false;
618   std::set<Instruction *> Done;
619   for (BasicBlock &BB : F) {
620     for (auto bi = BB.begin();; ++bi) {
621       auto Phi = dyn_cast<PHINode>(&*bi);
622       if (!Phi)
623         break;
624       if (!Done.insert(Phi).second)
625         continue; // phi node already processed in some web
626       // Gather the web of phi nodes and two address instructions related to
627       // this one.  This is an approximation to the web of instructions that
628       // will or could be coalesced.
629       // (Use Web as a worklist of phi nodes and two address instructions to
630       // use to find other phi nodes and two address instructions.)
631       //
632       // We process a web of related phi nodes at a time, rather than all phi
633       // nodes that use the constant, to avoid this situation:
634       // we try and common up two phi nodes in the same basic block (e.g. two
635       // variables both initialized to 0 before a loop), but end up having to
636       // insert a copy for one of them anyway in coalescing.
637       SmallVector<Instruction *, 4> Web;
638       Web.push_back(Phi);
639       for (unsigned wi = 0; wi != Web.size(); ++wi) {
640         auto Inst = Web[wi];
641         unsigned oi = 0, oe = 0;
642         if ((Phi = dyn_cast<PHINode>(Inst))) {
643           // Phi node: process each incoming.
644           oe = Phi->getNumIncomingValues();
645         } else {
646           if (auto *CI = dyn_cast<CallInst>(Inst)) {
647             // Two address instruction: process just the two address operand.
648             oi = *getTwoAddressOperandNum(CI);
649             oe = oi + 1;
650           } else {
651             IGC_ASSERT(isa<CastInst>(Inst));
652             oi = 0;
653             oe = 1;
654           }
655         }
656 
657         auto IsPhiOrTwoAddress = [=](Value *V) {
658           if (isa<PHINode>(V))
659             return true;
660           if (auto CI = dyn_cast<CallInst>(V))
661             return getTwoAddressOperandNum(CI).hasValue();
662           return false;
663         };
664 
665         // For each incoming:
666         for (; oi != oe; ++oi ) {
667           auto Incoming = Inst->getOperand(oi);
668           // If it is a phi node or two address instruction, push it into the
669           // web for processing later.
670           if (IsPhiOrTwoAddress(Incoming)) {
671             auto IncomingInst = cast<Instruction>(Incoming);
672             if (Done.insert(IncomingInst).second)
673               Web.push_back(IncomingInst);
674           } else if (!isa<Constant>(Incoming)) {
675             // For any other inst or arg, see if it has any other use in a phi
676             // node or two address inst, and push that into the web.
677             for (auto ui = Incoming->use_begin(), ue = Incoming->use_end();
678                 ui != ue; ++ui) {
679               auto User = cast<Instruction>(ui->getUser());
680               // Add bitcasts into the web to process their users too
681               if (IsPhiOrTwoAddress(User) ||
682                   (isa<CastInst>(User) && cast<CastInst>(User)->isNoopCast(DL)))
683                 if (Done.insert(User).second)
684                   Web.push_back(User);
685             }
686           }
687         }
688         // Now process each use of the result of the phi node or two address
689         // instruction. If the use is in a phi node or is a two address operand,
690         // push the user into the web.
691         for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) {
692           auto User = cast<Instruction>(ui->getUser());
693           if (IsPhiOrTwoAddress(User))
694             if (Done.insert(User).second)
695               Web.push_back(User);
696         }
697       }
698       LLVM_DEBUG(
699         dbgs() << "loadPhiConstants: Web of phi nodes and two address insts:\n";
700         for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi)
701           dbgs() << **wi << "\n"
702       );
703       // Now process the web, ignoring anything other than phi nodes.
704       // Gather the distinct constants, and every use for each one in a phi
705       // node.
706       std::map<Constant *, SmallVector<Use *, 4>> ConstantUses;
707       SmallVector<Constant *, 8> DistinctConstants;
708 
709       // Fill ConstantUses map
710       // Process phis with larger types first to make sure that wider
711       // constant goes to ConstantUses map first
712       auto WebPhisRange = make_filter_range(
713           Web, [](Instruction *I) { return isa<PHINode>(I); });
714       SmallVector<Instruction *, 4> WebPhis(WebPhisRange);
715       std::sort(WebPhis.begin(), WebPhis.end(),
716                 [&DL](Instruction *I1, Instruction *I2) {
717                   return DL.getTypeSizeInBits(I1->getType()) >
718                          DL.getTypeSizeInBits(I2->getType());
719                 });
720 
721       for (auto *Inst : WebPhis) {
722         auto *Phi = cast<PHINode>(Inst);
723         for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) {
724           Use *U = &Phi->getOperandUse(oi);
725           auto *C = dyn_cast<Constant>(*U);
726           if (!C || isa<UndefValue>(C))
727             continue;
728           // when doing this transform in pattern matching phase
729           if (ExcludePredicate) {
730             if (C->getType()->getScalarType()->isIntegerTy(1))
731               continue;
732             if (DL.getTypeSizeInBits(C->getType()) <= 256)
733               continue;
734             auto IncomingBlock = Phi->getIncomingBlock(oi);
735             if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock))
736               continue;
737           }
738 
739           // Merge uses if constants are bitcastable.
740           auto EqualC = llvm::find_if(DistinctConstants, [&C](Constant *C2) {
741             return genx::areConstantsEqual(C, C2);
742           });
743           if (EqualC != DistinctConstants.end())
744             C = *EqualC;
745 
746           auto Entry = &ConstantUses[C];
747           if (!Entry->size())
748             DistinctConstants.push_back(C);
749           Entry->push_back(U);
750         }
751       }
752       // Handle each distinct constant.
753       for (unsigned dci = 0, dce = DistinctConstants.size(); dci != dce; ++dci) {
754         Constant *C = DistinctConstants[dci];
755         auto Entry = &ConstantUses[C];
756         if (Entry->size() != 1) {
757           LLVM_DEBUG(
758             dbgs() << "multiple use of " << *C << "\n";
759             for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei)
760               dbgs() << *(*Entry)[ei]->getUser() << "\n"
761           );
762         }
763         // Find the closest common dominator of the incoming blocks of all phi
764         // uses of the constant. That is where we want to insert the constant
765         // load.
766         Use *U = (*Entry)[0];
767         auto InsertBB = cast<PHINode>(U->getUser())
768             ->getIncomingBlock(U->getOperandNo());
769         for (unsigned ei = 1, ee = Entry->size(); ei != ee; ++ei) {
770           U = (*Entry)[ei];
771           auto Phi = cast<PHINode>(U->getUser());
772           auto IncomingBB = Phi->getIncomingBlock(U->getOperandNo());
773           InsertBB = DT->findNearestCommonDominator(InsertBB, IncomingBB);
774         }
775         // If that location is an empty split critical edge block, go up to its
776         // predecessor (which is also its immediate dominator) if this block is
777         // "true" successor of branching simd cf block. In this case we cannot
778         // insert anything in current block and have to create partial
779         // redundancy.
780         IGC_ASSERT(InsertBB);
781         auto *InsertTerm = InsertBB->getTerminator();
782         auto *SinglePred = InsertBB->getSinglePredecessor();
783         if (InsertTerm->getNumSuccessors() == 1 &&
784             InsertTerm == &InsertBB->front() && SinglePred &&
785             GotoJoin::isBranchingGotoJoinBlock(SinglePred))
786           InsertBB = SinglePred;
787 
788         // Insert the constant load.
789         ConstantLoader CL(C, Subtarget, DL);
790         Value *Load = nullptr;
791         Instruction *InsertBefore = InsertBB->getTerminator();
792         if (!CL.isSimple())
793           Load = CL.loadBig(InsertBefore);
794         else
795           Load = CL.load(InsertBefore);
796         Modified = true;
797         // Modify the uses.
798 
799         SmallDenseMap<Type *, Value *, 4> CastMap;
800         // Create cast of specific type of given value or reuse it
801         // if exists
802         auto CreateOrReuseCast = [&CastMap](Value *V, Type *Ty,
803                                             Instruction *InsertBefore) {
804           // No cast needed
805           if (V->getType() == Ty)
806             return V;
807           // Assume bitcastable for now
808           if (!CastMap.count(Ty))
809             CastMap[Ty] =
810                 CastInst::Create(Instruction::BitCast, V, Ty,
811                                  V->getName() + ".cast", InsertBefore);
812           return CastMap[Ty];
813         };
814 
815         for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) {
816           auto *U = (*Entry)[ei];
817           *U = CreateOrReuseCast(Load, U->get()->getType(), InsertBefore);
818         }
819         // replace other non-phi uses that are also dominated by the InsertBB
820         for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) {
821           if (isa<PHINode>(Web[wi]))
822             continue;
823           auto CI = dyn_cast<CallInst>(Web[wi]);
824           if (CI && getTwoAddressOperandNum(CI)) {
825             auto oi = *getTwoAddressOperandNum(CI);
826             Use *U = &CI->getOperandUse(oi);
827             auto *UC = dyn_cast<Constant>(*U);
828             if (UC && UC == C) {
829               if (CI->getParent() != InsertBB && DT->dominates(InsertBB, CI->getParent()))
830                 *U = CreateOrReuseCast(Load, U->get()->getType(), InsertBefore);
831             }
832           }
833         }
834       }
835     }
836   }
837   return Modified;
838 }
839 
isReplicatedConstantVector(const ConstantVector * Orig,const ConstantVector * ReplicateCandidate)840 bool genx::isReplicatedConstantVector(
841     const ConstantVector *Orig, const ConstantVector *ReplicateCandidate) {
842   IGC_ASSERT(Orig && ReplicateCandidate);
843   // First compare for same element type
844   if (Orig->getType()->getElementType() !=
845       ReplicateCandidate->getType()->getElementType())
846     return false;
847 
848   unsigned OrigNumElements = Orig->getType()->getNumElements();
849   unsigned CandidateNumElements =
850       ReplicateCandidate->getType()->getNumElements();
851 
852   // Check replicate possibility by size: candidate should be
853   // at least larger and it's size is divisible by the size of
854   // original vector
855   if ((OrigNumElements >= CandidateNumElements) ||
856       ((CandidateNumElements % OrigNumElements) != 0))
857     return false;
858 
859   // Get slices
860   unsigned NumReplicates = CandidateNumElements / OrigNumElements;
861   SmallVector<Constant *, 4> Slices;
862   for (unsigned i = 0; i < NumReplicates; i++)
863     Slices.push_back(genx::getConstantSubvector(
864         ReplicateCandidate, i * OrigNumElements, OrigNumElements));
865 
866   // Compare all slices
867   return llvm::all_of(Slices,
868                       [Orig](Constant *Slice) { return Slice == Orig; });
869 }
870 
fixSimple(int OperandIdx)871 void ConstantLoader::fixSimple(int OperandIdx) {
872   IGC_ASSERT_MESSAGE(User, "user must be provided");
873   IGC_ASSERT_MESSAGE(NewC, "no need to fix simple case");
874   IGC_ASSERT_MESSAGE(User->getOperand(OperandIdx) == C,
875     "wrong arguments: wrong operand index was provided");
876   User->setOperand(OperandIdx, NewC);
877   C = NewC;
878   // indicate that we no longer need fix
879   NewC = nullptr;
880 }
881 
882 /***********************************************************************
883  * ConstantLoader::loadNonSimple : load a non-simple constant
884  *
885  * Enter:   C = constant to lower if necessary
886  *          Inst = instruction it is used in (also used to insert new
887  *                 code before)
888  *
889  * Return:  new instruction
890  */
loadNonSimple(Instruction * Inst)891 Instruction *ConstantLoader::loadNonSimple(Instruction *Inst) {
892   IGC_ASSERT(!isSimple());
893   if (!isLegalSize())
894     return loadBig(Inst);
895   if (PackedFloat) {
896     unsigned NumElts =
897         cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements();
898     SmallVector<Instruction *, 4> Quads;
899     for (unsigned i = 0, e = NumElts; i != e; i += 4) {
900       SmallVector<Constant *, 4> Quad;
901       for (unsigned j = 0; j != 4 && (i + j) < NumElts; ++j)
902         Quad.push_back(C->getAggregateElement(i + j));
903       ConstantLoader Packed(ConstantVector::get(Quad), Subtarget, DL);
904       Quads.push_back(Packed.load(Inst));
905     }
906     Value *V = UndefValue::get(C->getType());
907     unsigned Offset = 0;
908     auto DbgLoc = Inst->getDebugLoc();
909     for (auto &Q : Quads) {
910       auto *VTy = cast<IGCLLVM::FixedVectorType>(Q->getType());
911       Region R(V, &DL);
912       R.getSubregion(Offset, VTy->getNumElements());
913       V = R.createWrRegion(V, Q, "constant.quad" + Twine(Offset), Inst, DbgLoc);
914       Offset += VTy->getNumElements();
915     }
916     return cast<Instruction>(V);
917   }
918   if (PackedIntScale) {
919     auto PackTy = C->getType()->getScalarType();
920     // limit the constant-type to 32-bit because we do not want 64-bit operation
921     if (DL.getTypeSizeInBits(PackTy) > 32)
922       PackTy = Type::getInt32Ty(Inst->getContext());
923     // Load as a packed int vector with scale and/or adjust.
924     SmallVector<Constant *, 32> PackedVals;
925     for (unsigned
926              i = 0,
927              e = cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements();
928          i != e; ++i) {
929       int64_t Val = 0;
930       if (auto CI = dyn_cast<ConstantInt>(C->getAggregateElement(i))) {
931         Val = CI->getSExtValue();
932         Val -= PackedIntAdjust;
933         Val /= PackedIntScale;
934       }
935       PackedVals.push_back(ConstantInt::get(PackTy, Val, /*isSigned=*/true));
936       IGC_ASSERT(cast<ConstantInt>(PackedVals.back())->getSExtValue() >= -8
937           && cast<ConstantInt>(PackedVals.back())->getSExtValue() <= 15);
938     }
939 
940     ConstantLoader Packed(ConstantVector::get(PackedVals), Subtarget, DL);
941     auto *LoadPacked = Packed.loadNonPackedIntConst(Inst);
942     if (PackedIntScale != 1) {
943       auto *SplatVal =
944           ConstantInt::get(PackTy, PackedIntScale, /*isSigned=*/true);
945       auto *CVTy = cast<IGCLLVM::FixedVectorType>(C->getType());
946       auto ElemCount = IGCLLVM::getElementCount(CVTy->getNumElements());
947       auto *Op1 = ConstantVector::getSplat(ElemCount, SplatVal);
948       LoadPacked = BinaryOperator::Create(Instruction::Mul, LoadPacked, Op1,
949                                           "constantscale", Inst);
950     }
951     if (PackedIntAdjust) {
952       auto *SplatVal =
953           ConstantInt::get(PackTy, PackedIntAdjust, /*isSigned=*/true);
954       auto *CVTy = cast<IGCLLVM::FixedVectorType>(C->getType());
955       auto ElemCount = IGCLLVM::getElementCount(CVTy->getNumElements());
956       auto *Op1 = ConstantVector::getSplat(ElemCount, SplatVal);
957       LoadPacked = BinaryOperator::Create(Instruction::Add, LoadPacked, Op1,
958                                           "constantadjust", Inst);
959     }
960     if (DL.getTypeSizeInBits(PackTy) <
961         DL.getTypeSizeInBits(C->getType()->getScalarType())) {
962       LoadPacked = CastInst::CreateSExtOrBitCast(LoadPacked, C->getType(),
963                                                  "constantsext", Inst);
964 
965       bool IsI64 =
966           C->getType()->getScalarType() == Type::getInt64Ty(Inst->getContext());
967       if (IsI64 && !allowI64Ops()) {
968         if (LoadPacked->getOpcode() == Instruction::CastOps::SExt) {
969           LoadPacked = genx::emulateI64Operation(&Subtarget, LoadPacked,
970                                                  EmulationFlag::RAUWE);
971         }
972       }
973     }
974     return LoadPacked;
975   }
976   if (auto CC = getConsolidatedConstant(C)) {
977     // We're loading a vector of byte or short (but not i1). Use int so the
978     // instruction does not use so many channels. This may also save it being
979     // split by legalization.
980     ConstantLoader CCL(CC, Subtarget, DL);
981     Instruction *NewInst = nullptr;
982     if (CCL.isSimple())
983       NewInst = CCL.load(Inst);
984     else
985       NewInst = CCL.loadNonSimple(Inst);
986     NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(),
987         "constant", Inst);
988     if (AddedInstructions)
989       AddedInstructions->push_back(NewInst);
990     return NewInst;
991   }
992   auto *VT = cast<IGCLLVM::FixedVectorType>(C->getType());
993   unsigned NumElements = VT->getNumElements();
994   SmallVector<Constant *, 32> Elements;
995   unsigned UndefBits = 0;
996   if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C)) {
997     // Gather the elements.
998     for (unsigned i = 0; i != NumElements; ++i) {
999       Constant *El = CDV->getElementAsConstant(i);
1000       IGC_ASSERT_MESSAGE(!isa<UndefValue>(El), "CDV element can't be undef");
1001       Elements.push_back(El);
1002     }
1003   } else {
1004     ConstantVector *CV = cast<ConstantVector>(C);
1005     // Gather the elements.
1006     for (unsigned i = 0; i != NumElements; ++i) {
1007       Constant *El = CV->getOperand(i);
1008       if (isa<UndefValue>(El))
1009         UndefBits |= 1 << i;
1010       Elements.push_back(El);
1011     }
1012   }
1013   unsigned RemainingBits = ~UndefBits
1014       & ((NumElements == 32 ? 0 : 1 << NumElements) - 1);
1015   if (!RemainingBits) {
1016     // All elements are undef. This should have been simplified away earlier,
1017     // but we need to cope with it in case it was not. Just load the first
1018     // element.
1019     RemainingBits = 1;
1020   }
1021   Instruction *Result = 0;
1022   // If it is wider than 8 elements, see if we can load any group of 8 as a
1023   // packed vector.
1024   if (NumElements > 8) {
1025     for (unsigned Idx = 0; Idx < NumElements - 4; Idx += 8) {
1026       unsigned Size = std::min(8U, NumElements - Idx);
1027       Constant *SubC = getConstantSubvector(C, Idx, Size);
1028       if (isa<UndefValue>(SubC))
1029         continue;
1030       ConstantLoader SubLoader(SubC, Subtarget, DL);
1031       if (SubLoader.PackedIntScale == 0 && !SubLoader.isPackedFloatVector())
1032         continue;
1033       Region R(C, &DL);
1034       R.getSubregion(Idx, Size);
1035       if (SubLoader.isSimple()) {
1036         Value *SubV = SubC;
1037         Result = R.createWrConstRegion(
1038             Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()),
1039             SubV, "constant.split" + Twine(Idx), Inst, Inst->getDebugLoc());
1040       } else {
1041         Value* SubV = SubLoader.loadNonSimple(Inst);
1042         Result = R.createWrRegion(
1043             Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()),
1044             SubV, "constant.split" + Twine(Idx), Inst, Inst->getDebugLoc());
1045       }
1046       if (AddedInstructions)
1047         AddedInstructions->push_back(Result);
1048       RemainingBits &= ~(255 << Idx);
1049     }
1050     if (!RemainingBits)
1051       return Result;
1052   }
1053 
1054   // Build the splat sets, that is, the sets of elements of identical value.
1055   SmallVector<unsigned, 32> SplatSets;
1056   {
1057     ValueMap<Constant *, unsigned> SplatSetFinder;
1058     for (unsigned i = 0; i != NumElements; ++i) {
1059       Constant *El = Elements[i];
1060       if (!isa<UndefValue>(El)) {
1061         std::pair<ValueMap<Constant *, unsigned>::iterator, bool> Created
1062             = SplatSetFinder.insert(std::pair<Constant *, unsigned>(El,
1063                   SplatSets.size()));
1064         if (Created.second) {
1065           // First time this Constant has been seen.
1066           SplatSets.push_back(1 << i);
1067         } else {
1068           // Add on to existing splat set.
1069           SplatSets[Created.first->second] |= 1 << i;
1070         }
1071       }
1072     }
1073   }
1074   // Remove any splat set with only a single element.
1075   unsigned NewSize = 0;
1076   for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) {
1077     if (countPopulation(SplatSets[i]) >= 2)
1078       SplatSets[NewSize++] = SplatSets[i];
1079   }
1080   SplatSets.resize(NewSize);
1081   // Determine which elements are suitable for inclusion in a packed vector.
1082   // FIXME Not implemented yet. For an int vector constant, we need to
1083   // determine whether the instruction expects the operand to be signed
1084   // or unsigned.
1085 
1086   // Loop constructing the constant until it is complete.
1087   do {
1088     // Find the splat set that will contribute the most elements
1089     // to the vector, taking into account what elements we can access
1090     // in a 1D region write. (Initialize BestSplatSetBits so, if no best
1091     // splat is found, we just do a single element out of RemainingBits.)
1092     //
1093     // Note that we are looking for the splat set that sets the most elements,
1094     // not the one that _usefully_ sets the most elements. For example,
1095     // Examples/sepia has a constant vector of the form
1096     // < A, B, C, 0, 0, A, B, C >
1097     // We have four splat sets {0,5} {1,6} {2,7} {3,4}, each of which
1098     // has two elements. What we want to do is set one of the A, B or C
1099     // sets first, rather than the 0s, because region restrictions mean that
1100     // we can only set such a pair if we do it first. If the loop below were
1101     // to find the splat set that _usefully_ sets the most elements, all four
1102     // sets would say "2" and we would arbitrarily pick one of them. But, if
1103     // we ask each splat set how many elements it sets, even uselessly, then
1104     // the A, B and C sets say "8" and the 0 set says "2", and we ensure that
1105     // we do one of the A, B or C sets first.
1106     // So we end up setting the constant in this order (arbitrarily picking
1107     // A first):
1108     //     < A, A, A, A, A, A, A, A >
1109     //     <          0, 0          >
1110     //     <    B                   >
1111     //     <                   B    >
1112     //     <       C                >
1113     //     <                      C >
1114     // giving five wrregion instructions rather than six.
1115     unsigned BestSplatSetBits = 1 << genx::log2(RemainingBits);
1116     unsigned BestSplatSetUsefulBits = BestSplatSetBits;
1117     unsigned BestSplatSetCount = 1;
1118     Constant *BestSplatSetConst = Elements[genx::log2(RemainingBits)];
1119     for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) {
1120       unsigned Bits = getRegionBits(SplatSets[i] & RemainingBits,
1121           SplatSets[i] | RemainingBits | UndefBits, NumElements);
1122       unsigned Count = countPopulation(Bits);
1123       // For this splat set, Bits is a bitmap of the vector elements that
1124       // we can set in this splat set in a legal 1D region (possibly including
1125       // elements already set and undef elements), and Count is how many
1126       // elements that still need setting the region will set.
1127       if (Count > BestSplatSetCount) {
1128         BestSplatSetBits = Bits;
1129         BestSplatSetUsefulBits = Bits & SplatSets[i];
1130         BestSplatSetCount = Count;
1131         BestSplatSetConst = Elements[genx::log2(SplatSets[i])];
1132       }
1133     }
1134     // Now BestSplatSetBits is a bitmap of the vector elements to include in
1135     // the best splat. Set up the splatted constant.
1136     if (!Result) {
1137       // For the first time round the loop, just splat the whole vector,
1138       // whatever BestSplatBits says.
1139       Result = loadConstant(
1140           ConstantVector::getSplat(IGCLLVM::getElementCount(NumElements),
1141                                    BestSplatSetConst),
1142           Inst, Subtarget, DL, AddedInstructions);
1143       Result->setDebugLoc(Inst->getDebugLoc());
1144     } else {
1145       // Not the first time round the loop. Set up the splatted subvector,
1146       // and write it as a region.
1147       Region R(BestSplatSetBits,
1148                DL.getTypeSizeInBits(VT->getElementType()) / genx::ByteBits);
1149       Constant *NewConst = ConstantVector::getSplat(
1150           IGCLLVM::getElementCount(R.NumElements), BestSplatSetConst);
1151       Result = R.createWrConstRegion(Result, NewConst, "constant", Inst,
1152                                      Inst->getDebugLoc());
1153       if (AddedInstructions)
1154         AddedInstructions->push_back(Result);
1155     }
1156     RemainingBits &= ~BestSplatSetUsefulBits;
1157   } while (RemainingBits);
1158   return Result;
1159 }
1160 
1161 /***********************************************************************
1162  * getRegionBits : determine which vector elements we can set with a
1163  *                 1D region
1164  *
1165  * Enter:   NeededBits = bits for vector elements we need to set
1166  *          OptionalBits = bits for vector elements we could set
1167  *          VecWidth = number of elements in vector
1168  *
1169  * Return:  bits for vector elements to set as a legal 1D region,
1170  *          maximizing how many of NeededBits are set
1171  */
getRegionBits(unsigned NeededBits,unsigned OptionalBits,unsigned VecWidth)1172 unsigned ConstantLoader::getRegionBits(unsigned NeededBits,
1173     unsigned OptionalBits, unsigned VecWidth) {
1174   if (!NeededBits)
1175     return 0;
1176   // Get the first and last element numbers in NeededBits.
1177   unsigned FirstNeeded = countTrailingZeros(NeededBits, ZB_Undefined);
1178   unsigned LastNeeded = 31 - countLeadingZeros((uint32_t)NeededBits, ZB_Undefined);
1179   // Set the max width to the min size including both those elements
1180   // rounded up to the next power of two.
1181   unsigned MaxWidth = LastNeeded - FirstNeeded + 1;
1182   unsigned LogMaxWidth = genx::log2(MaxWidth);
1183   if (MaxWidth != 1U << LogMaxWidth) {
1184     ++LogMaxWidth;
1185     MaxWidth = 1U << LogMaxWidth;
1186   }
1187   // Special case NeededBits only having one element.
1188   if (LogMaxWidth == 0)
1189     return NeededBits;
1190   // Now find the best region.
1191   unsigned BestBits = 0;
1192   unsigned BestCount = 0;
1193   // Try each stride.
1194   static const unsigned StrideBitsTable[] = { 0xffffffffU, 0x55555555U, 0x11111111U };
1195   for (unsigned LogStride = 0, Stride = 1;
1196       LogStride <= 2U && LogStride < LogMaxWidth;
1197       ++LogStride, Stride <<= 1U) {
1198     // Try each width (not including 1).
1199     for (unsigned Width = 1U << (LogMaxWidth - LogStride); Width > 1; Width >>= 1) {
1200       if (Width <= BestCount)
1201         break;
1202       // Try each start index.
1203       for (unsigned Idx = 0; Idx + (Width - 1) * Stride < VecWidth; ++Idx) {
1204         if (Idx + Width > VecWidth)
1205           break;
1206         // Calculate which indexes the region will set.
1207         unsigned Bits = StrideBitsTable[LogStride];
1208         if (Width != 32)
1209           Bits &= (1 << Width) - 1;
1210         Bits <<= Idx;
1211         // See if it sets any elements that we are not allowed to set.
1212         if (Bits & ~(NeededBits | OptionalBits))
1213           continue;
1214         // See if it sets all of NeededBits.
1215         if ((Bits & NeededBits) == NeededBits)
1216           return Bits;
1217         // See if it is the best one we have seen so far.
1218         unsigned Count = countPopulation(Bits & NeededBits);
1219         if (Count > BestCount) {
1220           BestCount = Count;
1221           BestBits = Bits;
1222           if (BestCount == Width)
1223             break;
1224         }
1225       }
1226     }
1227   }
1228   if (!BestCount) {
1229     // We could not find any region that includes more than one of NeededBits.
1230     // Just do a single element.
1231     return 1 << genx::log2(NeededBits);
1232   }
1233   return BestBits;
1234 }
1235 
loadSplatConstant(Instruction * InsertPos)1236 Instruction *ConstantLoader::loadSplatConstant(Instruction *InsertPos) {
1237   // Skip scalar types, vector type with just one element, or boolean vector.
1238   auto *VTy = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1239   if (!VTy ||
1240       VTy->getNumElements() == 1 ||
1241       VTy->getScalarType()->isIntegerTy(1))
1242     return nullptr;
1243   // Skip non-splat vector.
1244   Constant *C1 = C->getSplatValue();
1245   if (!C1)
1246     return nullptr;
1247   // Create <1 x T> constant and broadcast it through rdregion.
1248   Constant *CV = ConstantVector::get(C1);
1249   // Load that scalar constant first.
1250   ConstantLoader L(CV, Subtarget, DL);
1251   Value *V = L.load(InsertPos);
1252   // Broadcast through rdregion.
1253   Region R(V, &DL);
1254   R.Width = R.NumElements = VTy->getNumElements();
1255   R.Stride = 0;
1256   R.VStride = 0;
1257   R.Offset = 0;
1258   Instruction *NewInst = R.createRdRegion(V, ".constsplat", InsertPos, DebugLoc());
1259   if (AddedInstructions)
1260     AddedInstructions->push_back(NewInst);
1261   return NewInst;
1262 }
1263 
1264 /***********************************************************************
1265  * ConstantLoader::load : insert instruction to load a constant
1266  *
1267  * We use llvm.genx.constant, rather than bitcast, because CSE has a habit
1268  * of propagating a constant bitcast back into our operand that is not
1269  * allowed to be constant.
1270  *
1271  * Enter:   C = constant to load
1272  *          InsertBefore = insert new instruction before here
1273  *
1274  * Return:  new instruction
1275  */
load(Instruction * InsertBefore)1276 Instruction *ConstantLoader::load(Instruction *InsertBefore) {
1277   IGC_ASSERT(isSimple());
1278   // Do not splat load on byte data as HW does not support byte imm source.
1279   if (!C->getType()->getScalarType()->isIntegerTy(8))
1280     if (auto NewInst = loadSplatConstant(InsertBefore))
1281       return NewInst;
1282 
1283   if (!PackedFloat && !PackedIntScale && !isa<UndefValue>(C)) { // not packed int constant or undef
1284     if (auto CC = getConsolidatedConstant(C)) {
1285       // We're loading a vector of byte or short (but not i1). Use int so the
1286       // instruction does not use so many channels. This may also save it being
1287       // split by legalization.
1288       Instruction *NewInst =
1289           loadConstant(CC, InsertBefore, Subtarget, DL, AddedInstructions);
1290       NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(),
1291           "constant", InsertBefore);
1292       if (AddedInstructions)
1293         AddedInstructions->push_back(NewInst);
1294       return NewInst;
1295     }
1296   }
1297 
1298   // Load the constant as normal.
1299   Value *Args[] = { C };   // Args to new llvm.genx.constant
1300   Type *OverloadedTypes[] = { C->getType() };
1301   GenXIntrinsic::ID IntrinsicID = GenXIntrinsic::genx_constanti;
1302   if (C->getType()->isFPOrFPVectorTy())
1303     IntrinsicID = GenXIntrinsic::genx_constantf;
1304   else if (C->getType()->getScalarType()->isIntegerTy(1))
1305     IntrinsicID = GenXIntrinsic::genx_constantpred;
1306   Module *M = InsertBefore->getParent()->getParent()->getParent();
1307   Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, OverloadedTypes);
1308   Instruction *NewInst = CallInst::Create(Decl, Args, "constant", InsertBefore);
1309   if (AddedInstructions)
1310     AddedInstructions->push_back(NewInst);
1311   return NewInst;
1312 }
1313 
1314 /***********************************************************************
1315  * ConstantLoader::loadNonPackedIntConst : insert instruction to load a constant
1316  *                               that are not packed because they have width > 8.
1317  *
1318  * Enter:   C = constant to load
1319  *          InsertBefore = insert new instruction before here
1320  *
1321  * Return:  new instruction
1322  */
loadNonPackedIntConst(Instruction * InsertBefore)1323 Instruction *ConstantLoader::loadNonPackedIntConst(Instruction *InsertBefore) {
1324   auto *CTy = cast<IGCLLVM::FixedVectorType>(C->getType());
1325   IGC_ASSERT(CTy->isIntOrIntVectorTy());
1326   // Simple vectors are already the correct size <= 8, process common load
1327   if (isSimple())
1328     return load(InsertBefore);
1329 
1330   unsigned NumElements = CTy->getNumElements();
1331   Instruction *Result = nullptr;
1332   for (unsigned Idx = 0; Idx != NumElements;) {
1333     unsigned Size =
1334         std::min(PowerOf2Floor(NumElements - Idx), (uint64_t)ImmIntVec::Width);
1335     Constant *SubC = getConstantSubvector(C, Idx, Size);
1336     Value *SubV = SubC;
1337     ConstantLoader SubLoader(SubC, Subtarget, DL);
1338     SubV = SubLoader.load(InsertBefore);
1339 
1340     Region R(C, &DL);
1341     R.getSubregion(Idx, Size);
1342     Result = R.createWrRegion(
1343         Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), SubV,
1344         "constant.split" + Twine(Idx), InsertBefore, DebugLoc());
1345     Idx += Size;
1346   }
1347   return Result;
1348 }
1349 /***********************************************************************
1350  * ConstantLoader::loadBig : insert instruction to load a constant that might
1351  *      be illegally sized
1352  */
loadBig(Instruction * InsertBefore)1353 Instruction *ConstantLoader::loadBig(Instruction *InsertBefore) {
1354   if (isLegalSize() || isa<UndefValue>(C)) {
1355     // Does not need legalizing.
1356     if (!isSimple())
1357       return loadNonSimple(InsertBefore);
1358     return load(InsertBefore);
1359   }
1360   IGC_ASSERT_MESSAGE(!C->getType()->getScalarType()->isIntegerTy(1),
1361     "not expecting predicate in here");
1362   if (Constant *Consolidated = getConsolidatedConstant(C)) {
1363     // Load as a consolidated constant, then bitcast to the correct type.
1364     auto Load =
1365         ConstantLoader(Consolidated, Subtarget, DL, nullptr, AddedInstructions)
1366             .loadBig(InsertBefore);
1367     IGC_ASSERT(Load);
1368     Load = CastInst::Create(Instruction::BitCast, Load, C->getType(),
1369         Load->getName() + ".cast", InsertBefore);
1370     if (AddedInstructions)
1371       AddedInstructions->push_back(Load);
1372     return Load;
1373   }
1374   auto VT = cast<IGCLLVM::FixedVectorType>(C->getType());
1375   const unsigned NumElements = VT->getNumElements();
1376   const unsigned GRFWidthInBits = Subtarget.getGRFByteSize() * genx::ByteBits;
1377   const unsigned ElementBits = DL.getTypeSizeInBits(VT->getElementType());
1378   unsigned MaxSize = 2 * GRFWidthInBits / ElementBits;
1379   MaxSize = std::min(MaxSize, 32U);
1380   Instruction *Result = nullptr;
1381   for (unsigned Idx = 0; Idx != NumElements; ) {
1382     unsigned Size = std::min(1U << genx::log2(NumElements - Idx), MaxSize);
1383     // Load this subvector constant if necessary, and insert into the overall
1384     // value with wrregion.
1385     Constant *SubC = getConstantSubvector(C, Idx, Size);
1386     Value *SubV = SubC;
1387     ConstantLoader SubLoader(SubC, Subtarget, DL);
1388     if (!SubLoader.isSimple())
1389       SubV = SubLoader.loadNonSimple(InsertBefore);
1390     Region R(C, &DL);
1391     R.getSubregion(Idx, Size);
1392     Result = R.createWrRegion(
1393         Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), SubV,
1394         "constant.split" + Twine(Idx), InsertBefore, DebugLoc());
1395     if (AddedInstructions)
1396       AddedInstructions->push_back(Result);
1397     Idx += Size;
1398   }
1399   return Result;
1400 }
1401 
1402 /***********************************************************************
1403  * ConstantLoader::isLegalSize : detect if a constant is a legal size
1404  */
isLegalSize() const1405 bool ConstantLoader::isLegalSize() const {
1406   auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1407   if (!VT)
1408     return true;
1409   const int NumBits = DL.getTypeSizeInBits(VT);
1410   if (!llvm::isPowerOf2_32(NumBits))
1411     return false;
1412   const int GRFSizeInBits = Subtarget.getGRFByteSize() * genx::ByteBits;
1413   if (NumBits > GRFSizeInBits * 2)
1414     return false; // bigger than 2 GRFs
1415   if (VT->getNumElements() > 32)
1416     return false; // 64 bytes not allowed
1417   return true;
1418 }
1419 
1420 /***********************************************************************
1421  * ConstantLoader::isBigSimple : detect if a constant is either simple,
1422  *    or would be simple after being split into legal sizes
1423  *
1424  * This does not do a thorough check so it misses some cases of a constant
1425  * that would split into simple constants.
1426  */
isBigSimple() const1427 bool ConstantLoader::isBigSimple() const {
1428   IGC_ASSERT_MESSAGE(!needFixingSimple(),
1429     "simple case shall be fixed first before this call");
1430   if (isa<UndefValue>(C))
1431     return true; // undef is simple
1432   auto VT = dyn_cast<VectorType>(C->getType());
1433   if (!VT)
1434     return true; // scalar always simple
1435   if (C->getSplatValue())
1436     return true; // splat constant always simple
1437   if (DL.getTypeSizeInBits(VT->getElementType()) == 1)
1438     return true; // predicate constant always simple
1439   return false;
1440 }
1441 
1442 /***********************************************************************
1443  * ConstantLoader::isSimple : detect if a constant is "simple"
1444  *
1445  * A simple constant is one we know can be a constant operand in an instruction.
1446  */
isSimple() const1447 bool ConstantLoader::isSimple() const {
1448   IGC_ASSERT_MESSAGE(!needFixingSimple(),
1449     "simple case shall be fixed first before this call");
1450   if (isa<UndefValue>(C))
1451     return true; // undef is simple (and generates no vISA code)
1452   if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue())
1453     return true; // all 1s predicate is simple
1454   if(User && User->isBinaryOp())
1455     if (isa<VectorType>(C->getType()))
1456       if (auto splat = C->getSplatValue())
1457         if (splat->isZeroValue())
1458           return true;
1459   if (!isLegalSize())
1460     return false; // Simple constant must be legally sized
1461   if (isBigSimple())
1462     return true; // a big simple constant that is legally sized is simple
1463   if (isPackedIntVector())
1464     return true;
1465   if (isPackedFloatVector())
1466     return true;
1467   return false;
1468 }
1469 
allowI64Ops() const1470 bool ConstantLoader::allowI64Ops() const {
1471   if (!Subtarget.hasLongLong())
1472     return false;
1473   return true;
1474 }
1475 /***********************************************************************
1476  * ConstantLoader::isPackedIntVector : check for a packed int vector
1477  *    (having already done the analysis in the ConstantLoader constructor)
1478  */
isPackedIntVector() const1479 bool ConstantLoader::isPackedIntVector() const {
1480   // Check for a packed int vector. Either the element type must be i16, or
1481   // the user (instruction using the constant) must be genx.constanti or
1482   // wrregion or wrconstregion. Not allowed if the user is a logic op.
1483   if (cast<IGCLLVM::FixedVectorType>(C->getType())->getNumElements() >
1484       ImmIntVec::Width)
1485     return false; // wrong width for packed vector
1486   if (PackedIntScale == 1 && (PackedIntAdjust == 0 || PackedIntAdjust == -8)) {
1487     if (!User)
1488       return true; // user not specified -- assume it is a mov, so wrong element
1489                    //  size is allowed
1490     if (!C->getType()->getScalarType()->isIntegerTy(16)
1491         && GenXIntrinsic::getGenXIntrinsicID(User) != GenXIntrinsic::genx_constanti
1492         && !GenXIntrinsic::isWrRegion(User))
1493       return false; // wrong element size when it is not a mov
1494     switch (User->getOpcode()) {
1495       case Instruction::And:
1496       case Instruction::Or:
1497       case Instruction::Xor:
1498         return false; // disallow packed vector in logic op
1499       default:
1500         break;
1501     }
1502     return true;
1503   }
1504   return false;
1505 }
1506 
1507 /***********************************************************************
1508  * ConstantLoader::isPackedFloatVector : check for a packed float vector
1509  *    (having already done the analysis in the ConstantLoader constructor)
1510  */
isPackedFloatVector() const1511 bool ConstantLoader::isPackedFloatVector() const {
1512   auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1513   if (!VT)
1514     return false;
1515   if (VT->getNumElements() > 4)
1516     return false;
1517   return PackedFloat;
1518 }
1519 
1520 /***********************************************************************
1521  * ConstantLoader::getConsolidatedConstant : get the consolidated constant
1522  *        for the given constant
1523  *
1524  * A "consolidated constant" is one where a vector of byte or short is
1525  * turned into the equivalent (as if by bitcast) vector of int.
1526  */
getConsolidatedConstant(Constant * C)1527 Constant *ConstantLoader::getConsolidatedConstant(Constant *C) {
1528   if (isa<UndefValue>(C))
1529     return nullptr;
1530   auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1531   if (!VT)
1532     return nullptr;
1533   const unsigned BytesPerElement =
1534       DL.getTypeSizeInBits(VT->getElementType()) / genx::ByteBits;
1535   const unsigned NumElements = VT->getNumElements();
1536   if (!BytesPerElement)
1537     return nullptr; // vector of i1
1538   if (BytesPerElement >= 4)
1539     return nullptr; // already vector of i32/i64/float/double
1540   if (NumElements * BytesPerElement & 3)
1541     return nullptr; // not a multiple of 4 bytes long
1542   // We're loading a vector of byte or short (but not i1). Use int so the
1543   // instruction does not use so many channels. This may also save it being
1544   // split by legalization.
1545   unsigned Compaction = BytesPerElement == 1 ? 4 : 2;
1546   unsigned Mask = BytesPerElement == 1 ? 0xff : 0xffff;
1547   SmallVector<Constant *, 8> Elements;
1548   Type *I32Ty = Type::getInt32Ty(C->getContext());
1549   for (unsigned i = 0; i != NumElements; i += Compaction) {
1550     unsigned Val = 0;
1551     bool IsUndef = true;
1552     for (unsigned j = 0; j != Compaction; ++j) {
1553       unsigned Bits = 0;
1554       Constant *El = C->getAggregateElement(i + j);
1555       // We assume that anything that is not ConstantInt is undefined. That
1556       // can include a constant expression with an undefined value in the
1557       // middle.
1558       if (auto CI = dyn_cast<ConstantInt>(El)) {
1559         Bits = CI->getSExtValue();
1560         IsUndef = false;
1561       }
1562       else if (auto CI = dyn_cast<ConstantFP>(El)) {
1563         APFloat V = CI->getValueAPF();
1564         Bits = V.bitcastToAPInt().getZExtValue();
1565         IsUndef = false;
1566       }
1567       Val |= (Bits & Mask) << (j * BytesPerElement * 8);
1568     }
1569     if (IsUndef)
1570       Elements.push_back(UndefValue::get(I32Ty));
1571     else
1572       Elements.push_back(ConstantInt::get(I32Ty, Val));
1573   }
1574   // Construct the constant with i32 element type.
1575   return ConstantVector::get(Elements);
1576 }
1577 
1578 /***********************************************************************
1579  * ConstantLoader::analyze : analyze a constant value
1580  *
1581  * This analyzes whether a constant of no more than the right vector width
1582  * (integer 8 or fp 4) can be loaded as a packed vector, possibly scaled
1583  * and adjusted.
1584  */
analyze()1585 void ConstantLoader::analyze() {
1586   auto *VT = dyn_cast<IGCLLVM::FixedVectorType>(C->getType());
1587   if (!VT)
1588     return;
1589   if (C->getSplatValue())
1590     return; // don't analyze if already a splat
1591   unsigned NumElements = VT->getNumElements();
1592   if (VT->getElementType()->isIntegerTy()) {
1593     unsigned MaxSize = 2 * Subtarget.getGRFByteSize(); // element type is boolean
1594     if (!VT->getElementType()->isIntegerTy(1)) {
1595       unsigned ElmSz = VT->getScalarSizeInBits() / genx::ByteBits;
1596       MaxSize = 2 * Subtarget.getGRFByteSize() / ElmSz;
1597     }
1598     if (NumElements <= MaxSize)
1599       analyzeForPackedInt(NumElements);
1600   } else if (NumElements <= 8 && VT->getElementType()->isFloatingPointTy())
1601     analyzeForPackedFloat(NumElements);
1602 }
1603 
/***********************************************************************
 * ConstantLoader::analyzeForPackedInt : analyze an integer vector constant
 *
 * Checks whether every defined element can be expressed as
 *   value = PackedIntScale * packed + PackedIntAdjust
 * where each packed element lies in the packed immediate range
 * (ImmIntVec::MinUInt..MaxUInt unsigned, or MinSInt..MaxSInt signed).
 * On success sets PackedIntScale/PackedIntAdjust/PackedIntMax; on any
 * failure it returns leaving them unchanged.
 */
void ConstantLoader::analyzeForPackedInt(unsigned NumElements) {
  // Get element values, tracking the min and max of the defined
  // (non-undef) elements.
  int64_t Min = INT64_MAX;
  int64_t Max = INT64_MIN;
  SmallVector<int64_t, 32> Elements;
  Constant *SomeDefinedElement = nullptr;
  for (unsigned i = 0; i != NumElements; ++i) {
    auto El = C->getAggregateElement(i);
    if (isa<UndefValue>(El))
      continue;
    SomeDefinedElement = El;
    int64_t Element = cast<ConstantInt>(El)->getSExtValue();
    Elements.push_back(Element);
    Min = std::min(Min, Element);
    Max = std::max(Max, Element);
  }
  if (Elements.empty()) {
    // Constant is undef.
    IGC_ASSERT_MESSAGE(C == UndefValue::get(C->getType()),
      "constant consists only of undef elements only if it's undef itself");
    return;
  }
  if (Elements.size() == 1) {
    // if we don't have an immediate user - do not create new constant
    // (constant materialization expects that NewC is cleared)
    if (!User)
      return;
    // All but one element undef. If num elements equal 8
    // then turn it into a splat constant
    if (NumElements != ImmIntVec::Width)
      return;
    NewC = ConstantVector::getSplat(IGCLLVM::getElementCount(NumElements),
                                    SomeDefinedElement);
    return;
  }
  // Bail out if the overall value range does not fit in int64.
  int64_t ResArith;
  if (IGCLLVM::SubOverflow(Max, Min, ResArith))
    return;
  // The whole range fits the packed immediate range without scaling.
  if (ResArith <= ImmIntVec::MaxUInt) {
    if (Min >= ImmIntVec::MinUInt && Max <= ImmIntVec::MaxUInt) {
      // Values all in the range [MinUInt..MaxUInt]. We can do this with a packed
      // unsigned int with no extra scaling or adjustment.
      PackedIntScale = 1;
      PackedIntAdjust = 0;
      PackedIntMax = Max;
      return;
    }
    if (Min >= ImmIntVec::MinSInt && Max <= ImmIntVec::MaxSInt) {
      // Values all in the range [MinSInt..MaxSInt]. We can do this with a packed
      // signed int with no extra scaling or adjustment (adjust of -8 marks
      // the signed encoding).
      PackedIntScale = 1;
      PackedIntAdjust = -8;
      PackedIntMax = Max + 8;
      return;
    }
    // Values all in the range [Min..Min+MaxUInt]. We can do this
    // with a packed int with an adjustment.
    PackedIntScale = 1;
    PackedIntAdjust = Min;
    PackedIntMax = Max - Min;
    return;
  }
  // Get unique absolute differences, so we can detect if we have a valid
  // packed int vector that is then scaled and has a splatted constant
  // added/subtracted. The GCD of consecutive-element differences divides
  // every pairwise difference, so consecutive differences suffice.
  SmallVector<uint64_t, 31> Diffs;
  SmallSet<uint64_t, 31> DiffsSet;
  for (unsigned i = 0, e = Elements.size() - 1; i != e; ++i) {
    int64_t Diff;
    if (IGCLLVM::SubOverflow(Elements[i + 1], Elements[i], Diff))
      return;
    if (!Diff)
      continue;
    uint64_t AbsDiff = std::abs(Diff);
    if (AbsDiff > UINT_MAX)
      return;
    if (DiffsSet.insert(AbsDiff).second)
      Diffs.push_back(AbsDiff);
  }
  IGC_ASSERT_MESSAGE(!Diffs.empty(), "not expecting splatted constant");
  // Calculate the GCD (greatest common divisor) of the diffs
  uint64_t GCD = Diffs[0];
  if (Diffs.size() > 1) {
    for(unsigned i = 1; i < Diffs.size(); i++)
      GCD = GreatestCommonDivisor64(GCD, Diffs[i]);
  }
  // Scale should fit in signed integer
  if (GCD > static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
    return;
  int64_t CurScale = static_cast<int64_t>(GCD);
  // If CurScale * MaxUInt overflows int64, the scaled range certainly covers
  // Max - Min (which fits in int64), so only bail out when the product is
  // representable and still too small.
  if (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxUInt), ResArith) &&
      (Max - Min) > ResArith)
    return; // range of values too big
  PackedIntScale = CurScale;
  PackedIntMax = ImmIntVec::MaxUInt;
  // Special case adjust of 0 or -8 as then we can save doing an adjust at all
  // by using unsigned or signed packed vector respectively. These special
  // cases only apply when Min is a multiple of the scale.
  if (!(Min % CurScale)) {
    // All scaled values in [MinUInt..MaxUInt]: unsigned, no adjust.
    if (Min >= ImmIntVec::MinUInt &&
        (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxUInt), ResArith) &&
         Max <= ResArith)) {
      PackedIntAdjust = ImmIntVec::MinUInt;
      return;
    }
    // All scaled values in [MinSInt..MaxSInt]: signed, no adjust.
    if ((!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MinSInt), ResArith) &&
         Min >= ResArith) &&
        (!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(ImmIntVec::MaxSInt), ResArith) &&
         Max <= ResArith)) {
      PackedIntAdjust = Min;
      PackedIntMax = ImmIntVec::MaxSInt;
      return;
    }
    // Special case all pre-scaled values being in [-15,0] as we can do that
    // by negating the scale and not needing to adjust.
    if ((!IGCLLVM::MulOverflow(CurScale, static_cast<int64_t>(-ImmIntVec::MaxUInt), ResArith) &&
         Min >= ResArith) &&
        Max <= -ImmIntVec::MinUInt) {
      PackedIntAdjust = ImmIntVec::MinUInt;
      PackedIntScale = -PackedIntScale;
      return;
    }
  }
  // General case: adjust by the minimum so the packed values start at 0.
  PackedIntAdjust = Min;
}
1728 
/***********************************************************************
 * is8bitPackedFloat : check whether a single-precision value can be encoded
 *    in the 8-bit restricted float immediate format (8 exponent values,
 *    4 fraction bits; sign is unrestricted)
 */
static bool is8bitPackedFloat(float f) {
  union {
    float f;
    unsigned u;
  } Bits;
  Bits.f = f;

  const unsigned Exp = (Bits.u >> 23) & 0xFF;
  const unsigned Frac = Bits.u & 0x7FFFFF;
  // Zero (of either sign) is always representable.
  if (Exp == 0 && Frac == 0)
    return true;
  // The biased exponent must fall in the eight encodable values.
  if (Exp < 124 || Exp > 131)
    return false;
  // Only the top four fraction bits may be set.
  if ((Frac & 0x780000) != Frac)
    return false;
  // The smallest exponent combined with a zero fraction is not encodable.
  return !(Exp == 124 && (Frac >> 19) == 0);
}
1749 
analyzeForPackedFloat(unsigned NumElements)1750 void ConstantLoader::analyzeForPackedFloat(unsigned NumElements) {
1751   if (!Subtarget.hasPackedFloat())
1752     return;
1753 
1754   for (unsigned i = 0; i != NumElements; ++i) {
1755     auto Elt = C->getAggregateElement(i);
1756     if (isa<UndefValue>(Elt))
1757       continue;
1758     ConstantFP *CFP = dyn_cast<ConstantFP>(Elt);
1759     // Bail out if any element cannot be analyzed.
1760     if (!CFP)
1761       return;
1762     const APFloat &FP = CFP->getValueAPF();
1763     // Bail out if it's not supported.
1764     // TODO: Only support single precision so far.
1765     if (&FP.getSemantics() != &APFloat::IEEEsingle())
1766       return;
1767     // Bail out if it's not finite.
1768     if (!FP.isFinite())
1769       return;
1770     // Check if it could be represented in 8-bit packed float.
1771     if (!is8bitPackedFloat(FP.convertToFloat()))
1772       return;
1773   }
1774   PackedFloat = true;
1775 }
1776