/*========================== begin_copyright_notice ============================

Copyright (C) 2018-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//
/// GenXEmulate
/// -----------
///
/// GenXEmulate is a module pass that emulates certain LLVM IR instructions.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "GENX_EMULATION"

#include "GenX.h"
#include "GenXSubtarget.h"
#include "GenXTargetMachine.h"
#include "GenXUtil.h"
#include "IGC/common/StringMacros.hpp"
#include "Probe/Assertion.h"

#include "llvmWrapper/IR/DerivedTypes.h"
#include "llvmWrapper/IR/Function.h"

#include "vc/BiF/Tools.h"
#include "vc/GenXOpts/Utils/InternalMetadata.h"
#include "vc/Support/BackendConfig.h"
#include "vc/Support/GenXDiagnostic.h"
#include "vc/Utils/General/BiF.h"

#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
#include "llvm/GenXIntrinsics/GenXMetadata.h"

#include <llvm/Analysis/TargetFolder.h>
#include <llvm/CodeGen/TargetPassConfig.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstVisitor.h>
#include <llvm/IR/Module.h>
#include <llvm/Linker/Linker.h>
#include <llvm/Pass.h>
#include <llvm/Support/Process.h>
#include <llvm/Support/raw_ostream.h>

#include <array>
#include <string>

using namespace llvm;
using namespace genx;

static constexpr const char *LibraryFunctionPrefix = "__cm_intrinsic_impl_";
static constexpr const char *EmuLibSDivPrefix = "__cm_intrinsic_impl_sdiv";
static constexpr const char *EmuLibSRemPrefix = "__cm_intrinsic_impl_srem";
static constexpr const char *EmuLibUDivPrefix = "__cm_intrinsic_impl_udiv";
static constexpr const char *EmuLibURemPrefix = "__cm_intrinsic_impl_urem";
static constexpr const char *EmuLibFP2UIPrefix = "__cm_intrinsic_impl_fp2ui";
static constexpr const char *EmuLibFP2SIPrefix = "__cm_intrinsic_impl_fp2si";
static constexpr const char *EmuLibUI2FPPrefix = "__cm_intrinsic_impl_ui2fp";
static constexpr const char *EmuLibSI2FPPrefix = "__cm_intrinsic_impl_si2fp";

struct PrefixOpcode {
  const char *Prefix;
  const unsigned Opcode;
};
constexpr std::array<PrefixOpcode, 4> DivRemPrefixes = {
    {{EmuLibSDivPrefix, BinaryOperator::SDiv},
     {EmuLibSRemPrefix, BinaryOperator::SRem},
     {EmuLibUDivPrefix, BinaryOperator::UDiv},
     {EmuLibURemPrefix, BinaryOperator::URem}}};

constexpr std::array<PrefixOpcode, 4> EmulationFPConvertsPrefixes = {
    {{EmuLibFP2UIPrefix, Instruction::FPToUI},
     {EmuLibFP2SIPrefix, Instruction::FPToSI},
     {EmuLibUI2FPPrefix, Instruction::UIToFP},
     {EmuLibSI2FPPrefix, Instruction::SIToFP}}};

static constexpr const char *RoundingRtzSuffix = "__rtz_";
static constexpr const char *RoundingRteSuffix = "__rte_";
static constexpr const char *RoundingRtpSuffix = "__rtp_";
static constexpr const char *RoundingRtnSuffix = "__rtn_";

// TODO: move this to vc-intrinsics
static constexpr int VCRoundingRTE = 0;
static constexpr int VCRoundingRTP = 1 << 4;
static constexpr int VCRoundingRTN = 2 << 4;
static constexpr int VCRoundingRTZ = 3 << 4;
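// Note: the constants above place the rounding mode in bits [5:4], which
// appears to mirror the hardware floating-point rounding control field;
// VCRoundingRTE (round-to-nearest-even) is the default zero encoding.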

namespace {

static cl::opt<bool> OptDbgOnlyDisableDivremEmulation(
    "vc-dbgonly-emu-disable-divrem", cl::init(false), cl::Hidden,
    cl::desc("do not load divrem emulation functions"));
// Currently, we have no guarantee that each and every genx intrinsic
// is emulated; only the most frequently encountered ones are.
// This flag helps to find such undetected cases.
static cl::opt<bool> OptStrictChecksEnable("vc-i64emu-strict-checks",
                                           cl::init(false), cl::Hidden,
                                           cl::desc("enables strict checks"));
static cl::opt<bool>
    OptStricterSVM("vc-i64emu-strict-report-svm", cl::init(false), cl::Hidden,
                   cl::desc("strict check will break on svm* operations"));
// NOTE: probably should be true by default
static cl::opt<bool>
    OptStricterAtomic("vc-i64emu-strict-report-atomic", cl::init(false),
                      cl::Hidden,
                      cl::desc("strict check will break on 64-bit atomics"));
static cl::opt<bool> OptStricterOword(
    "vc-i64emu-strict-report-oword", cl::init(false), cl::Hidden,
    cl::desc("strict check will break on 64-bit oword reads/writes"));
static cl::opt<bool> OptStricterAlloc(
    "vc-i64emu-strict-report-alloc", cl::init(false), cl::Hidden,
    cl::desc("strict check will break on 64-bit alloc"));
static cl::opt<bool> OptStricterFaddr(
    "vc-i64emu-strict-report-faddr", cl::init(false), cl::Hidden,
    cl::desc("strict check will break on 64-bit faddr"));
static cl::opt<bool>
    OptStricterConst("vc-i64emu-strict-const", cl::init(false), cl::Hidden,
                     cl::desc("strict check will break on 64-bit constanti"));
static cl::opt<bool> OptStricterRegions(
    "vc-i64emu-strict-regions", cl::init(false), cl::Hidden,
    cl::desc("strict check will break on 64-bit rdregion/wrregion"));
static cl::opt<bool> OptStricterConverts(
    "vc-i64emu-strict-converts", cl::init(false), cl::Hidden,
    cl::desc("strict check will break on 64-bit converts which are NOT noop"));
// TODO: we expect this to be turned on by default
static cl::opt<bool> OptStrictEmulationRequests(
    "vc-i64emu-strict-requests", cl::init(false), cl::Hidden,
    cl::desc("Explicit emulation requests are subject to stricter checks"));
static cl::opt<bool> OptIcmpEnable("vc-i64emu-icmp-enable", cl::init(true),
                                   cl::Hidden,
                                   cl::desc("enable icmp emulation"));
static cl::opt<bool> OptProcessPtrs("vc-i64emu-ptrs-enable", cl::init(true),
                                    cl::Hidden,
                                    cl::desc("enable i64 pointer emulation"));
static cl::opt<bool> OptConvertPartialPredicates(
    "vc-i64emu-icmp-ppred-lowering", cl::init(true), cl::Hidden,
    cl::desc("if \"partial predicates\" shall be converted to icmp"));

using IRBuilder = IRBuilder<TargetFolder>;
struct OpType {
  unsigned Opcode;
  Type *ResType;
  Type *FirstArgType;
};
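// Strict weak ordering over (Opcode, ResType, FirstArgType) so that OpType
// can serve as the key of the emulation-function map below.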
static std::function<bool(const OpType &, const OpType &)> OpTypeComparator =
    [](const OpType &ot1, const OpType &ot2) -> bool {
  if (ot1.Opcode < ot2.Opcode)
    return true;
  if (ot2.Opcode < ot1.Opcode)
    return false;
  if (ot1.ResType < ot2.ResType)
    return true;
  if (ot2.ResType < ot1.ResType)
    return false;
  return ot1.FirstArgType < ot2.FirstArgType;
};

template <typename T> static void processToEraseList(T &EraseList) {
  std::for_each(EraseList.begin(), EraseList.end(),
                [](auto *Item) { Item->eraseFromParent(); });
  EraseList.clear();
}

class GenXEmulate : public ModulePass {

  friend Instruction *llvm::genx::emulateI64Operation(const GenXSubtarget *ST,
                                                      Instruction *In,
                                                      EmulationFlag AuxAction);
  std::vector<Instruction *> DiscracedList;
  // Maps <opcode, type> to its corresponding emulation function.
  std::map<OpType, Function *, decltype(OpTypeComparator)> EmulationFuns{
      OpTypeComparator};

  std::vector<Instruction *> ToErase;
  const GenXSubtarget *ST = nullptr;

  class Emu64Expander : public InstVisitor<Emu64Expander, Value *> {

    friend InstVisitor<Emu64Expander, Value *>;

    const GenXSubtarget &ST;
    std::map<OpType, Function *, decltype(OpTypeComparator)> *EmulationFuns;

    IVSplitter SplitBuilder;
    Instruction &Inst;

    Value *expandBitwiseOp(BinaryOperator &);
    Value *expandBitLogicOp(BinaryOperator &);

    Value *visitAdd(BinaryOperator &);
    Value *visitSub(BinaryOperator &);
    Value *visitAnd(BinaryOperator &);
    Value *visitOr(BinaryOperator &);
    Value *visitXor(BinaryOperator &);
    Value *visitSelectInst(SelectInst &I);
    Value *visitICmp(ICmpInst &);

    Value *visitShl(BinaryOperator &);
    Value *visitLShr(BinaryOperator &);
    Value *visitAShr(BinaryOperator &);

    Value *buildRightShift(IVSplitter &SplitBuilder, BinaryOperator &Op);

    Value *visitZExtInst(ZExtInst &I);
    Value *visitSExtInst(SExtInst &I);

    Value *visitPtrToInt(PtrToIntInst &I);
    Value *visitIntToPtr(IntToPtrInst &I);

    Value *visitGenxTrunc(CallInst &CI);
    Value *visitGenxMinMax(CallInst &CI);
    // genx_absi
    Value *visitGenxAbsi(CallInst &CI);
    // handles genx_{XX}add_sat cases
    Value *visitGenxAddSat(CallInst &CI);
    // handles genx_fpto{X}i_sat cases
    Value *visitGenxFPToISat(CallInst &CI);

    // [+] bitcast
    // [-] genx.constanti ?
    // [-] genx.scatter ?
    // [-] genx.gather ?
    Value *visitCallInst(CallInst &CI);
    Value *visitInstruction(Instruction &I) { return nullptr; }

    // If the value is not an Instruction (like a ConstantExpr), return the
    // original value. Return the emulated sequence otherwise.
    Value *ensureEmulated(Value *Val);

    static bool isI64PointerOp(const Instruction &I);
    static bool isConvertOfI64(const Instruction &I);
    static bool isI64ToFP(const Instruction &I);
    static bool isI64Cmp(const Instruction &I);
    static bool isI64AddSat(const Instruction &I);
    static Value *detectBitwiseNot(BinaryOperator &);
    static Type *changeScalarType(Type *T, Type *NewTy);

    struct VectorInfo {
      Value *V;
      IGCLLVM::FixedVectorType *VTy;
    };
    static VectorInfo toVector(IRBuilder &Builder, Value *In);
    static bool getConstantUI32Values(Value *V,
                                      SmallVectorImpl<uint32_t> &Result);

    // functors to help with shift emulation
    struct LessThan32 {
      bool operator()(uint64_t Val) const { return Val < 32u; }
    };
    struct GreaterThan32 {
      bool operator()(uint64_t Val) const { return Val > 32u; }
    };
    struct Equals32 {
      bool operator()(uint64_t Val) const { return Val == 32u; }
    };

    bool needsEmulation() const {
      return (SplitBuilder.IsI64Operation() || isI64Cmp(Inst) ||
              isConvertOfI64(Inst) || isI64PointerOp(Inst) ||
              isI64AddSat(Inst));
    }

    IRBuilder getIRBuilder() {
      return IRBuilder(Inst.getParent(), BasicBlock::iterator(&Inst),
                       TargetFolder(Inst.getModule()->getDataLayout()));
    }

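    // Emits i32 constants (splat/zero/all-ones) whose vector width matches
    // the width of the value passed to the constructor; used when building
    // masks and saturation constants for the 32-bit halves of a split value.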
    class ConstantEmitter {
    public:
      ConstantEmitter(Value *V)
          : ElNum(
                cast<IGCLLVM::FixedVectorType>(V->getType())->getNumElements()),
            Ty32(Type::getInt32Ty(V->getContext())) {}
      Constant *getSplat(unsigned Val) const {
        auto *KV = Constant::getIntegerValue(Ty32, APInt(32, Val));
        return ConstantDataVector::getSplat(ElNum, KV);
      }
      Constant *getZero() const { return Constant::getNullValue(getVTy()); }
      Constant *getOnes() const { return Constant::getAllOnesValue(getVTy()); }
      Type *getVTy() const {
        return IGCLLVM::FixedVectorType::get(Ty32, ElNum);
      }

    private:
      unsigned ElNum = 0;
      Type *Ty32 = nullptr;
    };

  public:
    Emu64Expander(
        const GenXSubtarget &ST, Instruction &I,
        std::map<OpType, Function *, decltype(OpTypeComparator)> *EF = nullptr)
        : ST(ST), SplitBuilder(I), Inst(I), EmulationFuns(EF) {}

    const GenXSubtarget &getSubtarget() const { return ST; }
    Value *tryExpand() {
      if (!needsEmulation())
        return nullptr;
      LLVM_DEBUG(dbgs() << "i64-emu: trying " << Inst << "\n");
      auto *Result = visit(Inst);

      if (Result)
        LLVM_DEBUG(dbgs() << "i64-emu: emulated with " << *Result << "\n");

      return Result;
    }
    using LHSplit = IVSplitter::LoHiSplit;
    Value *buildTernaryAddition(IRBuilder &Builder, Value &A, Value &B,
                                Value &C, const Twine &Name) const;
    struct AddSubExtResult {
      Value *Val; // Main Value
      Value *CB;  // Carry/Borrow
    };
    static AddSubExtResult buildAddc(Module *M, IRBuilder &B, Value &R,
                                     Value &L, const Twine &Prefix);
    static AddSubExtResult buildSubb(Module *M, IRBuilder &B, Value &L,
                                     Value &R, const Twine &Prefix);
    static Value *buildGeneralICmp(IRBuilder &B, CmpInst::Predicate P,
                                   bool IsPartialPredicate, const LHSplit &L,
                                   const LHSplit &R);
    static Value *tryOptimizedShr(IRBuilder &B, IVSplitter &SplitBuilder,
                                  BinaryOperator &Op, ArrayRef<uint32_t> Sa);
    static Value *tryOptimizedShl(IRBuilder &B, IVSplitter &SplitBuilder,
                                  BinaryOperator &Op, ArrayRef<uint32_t> Sa);
    static Value *buildGenericRShift(IRBuilder &B, IVSplitter &SplitBuilder,
                                     BinaryOperator &Op);

    enum Rounding {
      // Not used currently
    };
    struct ShiftInfo {
      ShiftInfo(Value *ShaIn, Value *Sh32In, Value *Mask1In, Value *Mask0In)
          : Sha{ShaIn}, Sh32{Sh32In}, Mask1{Mask1In}, Mask0{Mask0In} {}
      // Masked Shift Amount
      Value *Sha = nullptr;
      // 32 - Sha
      Value *Sh32 = nullptr;
      // To zero-out the high part (shift >= 32)
      Value *Mask1 = nullptr;
      // To negate results if Sha = 0
      Value *Mask0 = nullptr;
    };
    static Value *buildPartialRShift(IRBuilder &B, Value *SrcLo, Value *SrcHi,
                                     const ShiftInfo &SI);
    static ShiftInfo constructShiftInfo(IRBuilder &B, Value *Base);

    static bool hasStrictEmulationRequirement(Instruction *Inst);
  };

public:
  static char ID;
  explicit GenXEmulate() : ModulePass(ID) {}
  StringRef getPassName() const override { return "GenX emulation"; }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnModule(Module &M) override;
  void runOnFunction(Function &F);

private:
  Value *emulateInst(Instruction *Inst);
  Function *getEmulationFunction(const Instruction *Inst) const;
  void buildEmuFunCache(Module &M);

  // Check if a function is to emulate instructions.
  static bool isEmulationFunction(const Function *F) {
    return F->hasFnAttribute(genx::FunctionMD::VCEmulationRoutine);
  }
};

} // end namespace

bool GenXEmulate::Emu64Expander::isI64PointerOp(const Instruction &I) {
  auto Opcode = I.getOpcode();
  const DataLayout &DL = I.getModule()->getDataLayout();
  if (Opcode == Instruction::ICmp) {
    auto *OpSTy = I.getOperand(0)->getType()->getScalarType();
    if (!OpSTy->isPointerTy())
      return false;
    if (DL.getTypeSizeInBits(OpSTy) < 64)
      return false;
    return true;
  }
  if (Opcode == Instruction::PtrToInt || Opcode == Instruction::IntToPtr) {
    auto *PtrType = I.getType()->getScalarType();
    auto *IntType = I.getOperand(0)->getType()->getScalarType();
    if (Opcode == Instruction::PtrToInt)
      std::swap(PtrType, IntType);
    if (cast<CastInst>(&I)->isNoopCast(DL))
      return false;
    return (DL.getTypeSizeInBits(PtrType) == 64 ||
            DL.getTypeSizeInBits(IntType) == 64);
  }
  return false;
}
bool GenXEmulate::Emu64Expander::isConvertOfI64(const Instruction &I) {

  if (GenXEmulate::Emu64Expander::isI64ToFP(I))
    return true;

  auto IID = GenXIntrinsic::getAnyIntrinsicID(&I);
  switch (IID) {
  case GenXIntrinsic::genx_uutrunc_sat:
  case GenXIntrinsic::genx_sstrunc_sat:
  case GenXIntrinsic::genx_ustrunc_sat:
  case GenXIntrinsic::genx_sutrunc_sat:
    return I.getOperand(0)->getType()->getScalarType()->isIntegerTy(64);
  }
  return false;
}
bool GenXEmulate::Emu64Expander::isI64ToFP(const Instruction &I) {
  if (Instruction::UIToFP != I.getOpcode() &&
      Instruction::SIToFP != I.getOpcode()) {
    return false;
  }
  return I.getOperand(0)->getType()->getScalarType()->isIntegerTy(64);
}
bool GenXEmulate::Emu64Expander::isI64Cmp(const Instruction &I) {
  if (Instruction::ICmp != I.getOpcode())
    return false;
  return I.getOperand(0)->getType()->getScalarType()->isIntegerTy(64);
}
bool GenXEmulate::Emu64Expander::isI64AddSat(const Instruction &I) {
  if (auto *CI = dyn_cast<CallInst>(&I)) {
    switch (GenXIntrinsic::getAnyIntrinsicID(CI)) {
    case GenXIntrinsic::genx_suadd_sat:
    case GenXIntrinsic::genx_usadd_sat:
    case GenXIntrinsic::genx_uuadd_sat:
    case GenXIntrinsic::genx_ssadd_sat: {
      Value *Arg0 = I.getOperand(0);
      Value *Arg1 = I.getOperand(1);
      return Arg0->getType()->isIntOrIntVectorTy(64) &&
             Arg1->getType()->isIntOrIntVectorTy(64);
    }
    default:
      return false;
    }
  }
  return false;
}

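// Detects the `xor X, -1` (bitwise NOT) pattern: returns the non-constant
// operand when the other operand is an all-ones constant, nullptr otherwise.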
Value *GenXEmulate::Emu64Expander::detectBitwiseNot(BinaryOperator &Op) {
  if (Instruction::Xor != Op.getOpcode())
    return nullptr;

  auto isAllOnes = [](const Value *V) {
    if (auto *C = dyn_cast<Constant>(V))
      return C->isAllOnesValue();
    return false;
  };

  if (isAllOnes(Op.getOperand(1)))
    return Op.getOperand(0);
  if (isAllOnes(Op.getOperand(0)))
    return Op.getOperand(1);

  return nullptr;
}

// Changes scalar to scalar, vector to vector
Type *GenXEmulate::Emu64Expander::changeScalarType(Type *T, Type *NewTy) {
  IGC_ASSERT_MESSAGE(NewTy == NewTy->getScalarType(), "NewTy must be scalar");
  return (T->isVectorTy())
             ? IGCLLVM::FixedVectorType::get(
                   NewTy, cast<IGCLLVM::FixedVectorType>(T)->getNumElements())
             : NewTy;
}

// changes vector/scalar i64 type so it now uses scalar type i32
// <2 x i64> -> <4 x i32>
// i64 -> <2 x i32>
static Type *convertI64TypeToI32(const Type *OldType) {
  IGC_ASSERT_MESSAGE(OldType, "Error: nullptr input");
  IGC_ASSERT_MESSAGE(OldType->isIntOrIntVectorTy(),
                     "Error: OldType not int or int vector type");
  IGC_ASSERT_MESSAGE(OldType->getScalarType()->isIntegerTy(64),
                     "Error: OldType Scalar type not i64");

  bool OldTypeIsVec = isa<IGCLLVM::FixedVectorType>(OldType);

  Type *Int32Ty = Type::getInt32Ty(OldType->getContext());

  unsigned OldWidth =
      OldTypeIsVec ? cast<IGCLLVM::FixedVectorType>(OldType)->getNumElements()
                   : 1;

  constexpr unsigned Multiplier = 2;
  unsigned NewWidth = OldWidth * Multiplier;
  return IGCLLVM::FixedVectorType::get(Int32Ty, NewWidth);
}

// Change type and exec size, like
// or <2 x i64> -> or <4 x i32>
// or i64 -> or <2 x i32>
//
// The resulting LLVM IR:
// From:
// %res = or <2 x i64> %val1, %val2
// To:
// %val1.cast = bitcast %val1 to <4 x i32>
// %val2.cast = bitcast %val2 to <4 x i32>
// %res.tmp = or <4 x i32> %val1.cast, %val2.cast
// %res = bitcast %res.tmp to <2 x i64>
Value *GenXEmulate::Emu64Expander::expandBitLogicOp(BinaryOperator &Op) {
  auto Builder = getIRBuilder();

  Type *PrevBinOpTy = Op.getType();
  Type *NextBinOpTy = convertI64TypeToI32(PrevBinOpTy);
  IGC_ASSERT(NextBinOpTy);

  Value *Op0 = Op.getOperand(0);
  Value *Op1 = Op.getOperand(1);

  Value *Op0Cast =
      Builder.CreateBitCast(Op0, NextBinOpTy, Op0->getName() + ".cast");
  Value *Op1Cast =
      Builder.CreateBitCast(Op1, NextBinOpTy, Op1->getName() + ".cast");

  Value *BinOp = Builder.CreateBinOp(Op.getOpcode(), Op0Cast, Op1Cast,
                                     Twine("int_emu.") + Inst.getName());

  return Builder.CreateBitCast(BinOp, PrevBinOpTy, Op.getName() + ".cast");
}

Value *GenXEmulate::Emu64Expander::expandBitwiseOp(BinaryOperator &Op) {
  auto Src0 = SplitBuilder.splitOperandHalf(0);
  auto Src1 = SplitBuilder.splitOperandHalf(1);

  auto Builder = getIRBuilder();

  Value *Part1 = Builder.CreateBinOp(Op.getOpcode(), Src0.Left, Src1.Left,
                                     Inst.getName() + ".part1");
  Value *Part2 = Builder.CreateBinOp(Op.getOpcode(), Src0.Right, Src1.Right,
                                     Inst.getName() + ".part2");
  return SplitBuilder.combineHalfSplit(
      {Part1, Part2}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Inst.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitAdd(BinaryOperator &Op) {
  auto Src0 = SplitBuilder.splitOperandLoHi(0);
  auto Src1 = SplitBuilder.splitOperandLoHi(1);

  auto Builder = getIRBuilder();
  // add64 transforms as:
  //    [add_lo, carry] = genx_addc(src0.lo, src1.lo)
  //    add_hi = add(carry, add(src0.hi, src1.hi))
  //    add64  = combine(add_lo, add_hi)
  auto AddcRes = buildAddc(Inst.getModule(), Builder, *Src0.Lo, *Src1.Lo,
                           "int_emu.add64.lo.");
  auto *AddLo = AddcRes.Val;
  auto *AddHi =
      buildTernaryAddition(Builder, *AddcRes.CB, *Src0.Hi, *Src1.Hi, "add_hi");
  return SplitBuilder.combineLoHiSplit(
      {AddLo, AddHi}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Inst.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitSub(BinaryOperator &Op) {
  auto Src0 = SplitBuilder.splitOperandLoHi(0);
  auto Src1 = SplitBuilder.splitOperandLoHi(1);

  auto *SubbFunct = GenXIntrinsic::getGenXDeclaration(
      Inst.getModule(), GenXIntrinsic::genx_subb,
      {Src0.Lo->getType(), Src1.Lo->getType()});

  auto Builder = getIRBuilder();
  // sub64 transforms as:
  //    [sub_lo, borrow] = genx_subb(src0.lo, src1.lo)
  //    sub_hi = add(src0.hi, add(-borrow, -src1.hi))
  //    sub64  = combine(sub_lo, sub_hi)
  using namespace GenXIntrinsic::GenXResult;
  auto *SubbVal = Builder.CreateCall(SubbFunct, {Src0.Lo, Src1.Lo}, "subb");
  auto *SubLo = Builder.CreateExtractValue(SubbVal, {IdxSubb_Sub}, "subb.sub");
  auto *Borrow =
      Builder.CreateExtractValue(SubbVal, {IdxSubb_Borrow}, "subb.borrow");
  auto *MinusBorrow = Builder.CreateNeg(Borrow, "borrow.negate");
  auto *MinusS1Hi = Builder.CreateNeg(Src1.Hi, "negative.src1_hi");
  auto *SubHi = buildTernaryAddition(Builder, *Src0.Hi, *MinusBorrow,
                                     *MinusS1Hi, "sub_hi");
  return SplitBuilder.combineLoHiSplit(
      {SubLo, SubHi}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Inst.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitAnd(BinaryOperator &Op) {
  return expandBitLogicOp(Op);
}
Value *GenXEmulate::Emu64Expander::visitOr(BinaryOperator &Op) {
  return expandBitLogicOp(Op);
}
Value *GenXEmulate::Emu64Expander::visitXor(BinaryOperator &Op) {
  if (auto *NotOperand = detectBitwiseNot(Op)) {
    unsigned OperandIdx = NotOperand == Op.getOperand(0) ? 0 : 1;
    auto Src0 = SplitBuilder.splitOperandHalf(OperandIdx);
    auto *Part1 = BinaryOperator::CreateNot(Src0.Left, ".part1_not", &Inst);
    auto *Part2 = BinaryOperator::CreateNot(Src0.Right, ".part2_not", &Inst);
    Part1->setDebugLoc(Inst.getDebugLoc());
    Part2->setDebugLoc(Inst.getDebugLoc());
    return SplitBuilder.combineHalfSplit({Part1, Part2}, "int_emu.not.",
                                         Op.getType()->isIntegerTy());
  }
  return expandBitLogicOp(Op);
}
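// Normalizes a value to vector form: vectors pass through, scalar integer
// constants become single-element constant vectors, and any other scalar is
// bitcast to a <1 x Ty> vector.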
GenXEmulate::Emu64Expander::VectorInfo
GenXEmulate::Emu64Expander::toVector(IRBuilder &Builder, Value *In) {
  if (In->getType()->isVectorTy())
    return {In, cast<IGCLLVM::FixedVectorType>(In->getType())};

  if (auto *CIn = dyn_cast<ConstantInt>(In)) {
    uint64_t CVals[] = {CIn->getZExtValue()};
    auto *VectorValue = ConstantDataVector::get(In->getContext(), CVals);
    return {VectorValue,
            cast<IGCLLVM::FixedVectorType>(VectorValue->getType())};
  }
  auto *VTy = IGCLLVM::FixedVectorType::get(In->getType(), 1);
  auto *VectorValue = Builder.CreateBitCast(In, VTy);
  return {VectorValue, VTy};
  // Note: alternatively, we could do something like this:
  // Value *UndefVector = UndefValue::get(VTy);
  // return Builder.CreateInsertElement(UndefVector, In, (uint64_t)0, ...
}
bool GenXEmulate::Emu64Expander::getConstantUI32Values(
    Value *V, SmallVectorImpl<uint32_t> &Result) {

  auto FitsUint32 = [](uint64_t V) {
    return V <= std::numeric_limits<uint32_t>::max();
  };
  Result.clear();
  if (auto *Scalar = dyn_cast<ConstantInt>(V)) {
    uint64_t Value = Scalar->getZExtValue();
    if (!FitsUint32(Value))
      return false;
    Result.push_back(Value);
    return true;
  }
  auto *SeqVal = dyn_cast<ConstantDataSequential>(V);
  if (!SeqVal)
    return false;

  Result.reserve(SeqVal->getNumElements());
  for (unsigned i = 0; i < SeqVal->getNumElements(); ++i) {
    auto *CV = dyn_cast_or_null<ConstantInt>(SeqVal->getAggregateElement(i));
    if (!CV)
      return false;
    uint64_t Value = CV->getZExtValue();
    if (!FitsUint32(Value))
      return false;
    Result.push_back(Value);
  }
  return true;
}
Value *GenXEmulate::Emu64Expander::visitSelectInst(SelectInst &I) {
  auto SrcTrue = SplitBuilder.splitOperandLoHi(1);
  auto SrcFalse = SplitBuilder.splitOperandLoHi(2);
  auto *Cond = I.getCondition();

  auto Builder = getIRBuilder();
  // sel on 64-bit values transforms as:
  //    split TrueVal and FalseVal into lo/hi parts
  //    lo_part = sel(cond, src0.lo, src1.lo)
  //    hi_part = sel(cond, src0.hi, src1.hi)
  //    result  = combine(lo_part, hi_part)
  auto *SelLo = Builder.CreateSelect(Cond, SrcTrue.Lo, SrcFalse.Lo, "sel.lo");
  auto *SelHi = Builder.CreateSelect(Cond, SrcTrue.Hi, SrcFalse.Hi, "sel.hi");
  return SplitBuilder.combineLoHiSplit(
      {SelLo, SelHi}, Twine("int_emu.") + I.getOpcodeName() + ".",
      I.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitICmp(ICmpInst &Cmp) {
  if (!OptIcmpEnable)
    return nullptr;

  auto Builder = getIRBuilder();

  if (isI64PointerOp(Cmp)) {

    if (!OptProcessPtrs) {
      LLVM_DEBUG(dbgs() << "i64-emu::WARNING: " << Cmp
                        << " won't be emulated\n");
      return nullptr;
    }

    Type *Ty64 = Builder.getInt64Ty();
    if (Cmp.getType()->isVectorTy()) {
      auto NumElements =
          cast<IGCLLVM::FixedVectorType>(Cmp.getType())->getNumElements();
      Ty64 = IGCLLVM::FixedVectorType::get(Ty64, NumElements);
    }
    auto *IL = Builder.CreatePtrToInt(Cmp.getOperand(0), Ty64);
    auto *IR = Builder.CreatePtrToInt(Cmp.getOperand(1), Ty64);
    // Create new 64-bit compare
    auto *NewICMP = Builder.CreateICmp(Cmp.getPredicate(), IL, IR);
    return ensureEmulated(NewICMP);
  }

  const bool PartialPredicate =
      std::any_of(Cmp.user_begin(), Cmp.user_end(), [](const User *U) {
        auto IID = GenXIntrinsic::getAnyIntrinsicID(U);
        return IID == GenXIntrinsic::genx_wrpredregion ||
               IID == GenXIntrinsic::genx_wrpredpredregion;
      });

  unsigned BaseOperand = 0;
  const bool FoldConstants = !(PartialPredicate && OptConvertPartialPredicates);
  IVSplitter Splitter(Cmp, &BaseOperand);
  auto Src0 = Splitter.splitOperandLoHi(0, FoldConstants);
  auto Src1 = Splitter.splitOperandLoHi(1, FoldConstants);

  Value *Result = buildGeneralICmp(Builder, Cmp.getPredicate(),
                                   PartialPredicate, Src0, Src1);

  if (Cmp.getType()->isIntegerTy() && !Result->getType()->isIntegerTy()) {
    // we expect this cast to be possible
    IGC_ASSERT(Cmp.getType() == Result->getType()->getScalarType());
    Result = Builder.CreateBitCast(Result, Cmp.getType(),
                                   Result->getName() + ".toi");
  }
  return Result;
}
Value *GenXEmulate::Emu64Expander::visitShl(BinaryOperator &Op) {

  auto Builder = getIRBuilder();

  llvm::SmallVector<uint32_t, 8> ShaVals;
  if (getConstantUI32Values(Op.getOperand(1), ShaVals)) {
    auto *Result = tryOptimizedShl(Builder, SplitBuilder, Op, ShaVals);
    if (Result)
      return Result;
  }

  auto L = SplitBuilder.splitOperandLoHi(0);
  auto R = SplitBuilder.splitOperandLoHi(1);

  auto SI = constructShiftInfo(Builder, R.Lo);
  ConstantEmitter K(L.Lo);

  // Shift Left
  // 1. Calculate MASK1. MASK1 is 0 when the shift is >= 32 (large shift)
  // 2. Calculate MASK0. MASK0 is 0 iff the shift is 0
  // 3. Calculate Lo part:
  //    [(L.Lo *SHL* SHA) *AND* MASK1] | MASK1 to ensure zero if large shift
  auto *Lo = Builder.CreateAnd(Builder.CreateShl(L.Lo, SI.Sha), SI.Mask1);
  // 4. Calculate Hi part:
  // Hl1: [L.Lo *SHL* (SHA - 32)] *AND* ~MASK1 | shifted out values, large shift
  // Hl2: [(L.Lo *AND* MASK0) *LSR* (32 - SHA)] *AND* MASK1 | nz for small shift
  // Hh:  [(L.Hi *SHL* Sha)] *AND* MASK1 | MASK1 discards result if large shift
  // Hi:  *OR* the above
  // NOTE: SI.Sh32 == (32 - SHA)
  auto *Hl1 = Builder.CreateShl(L.Lo, Builder.CreateNeg(SI.Sh32));
  Hl1 = Builder.CreateAnd(Hl1, Builder.CreateNot(SI.Mask1));

  auto *Hl2 = Builder.CreateLShr(Builder.CreateAnd(L.Lo, SI.Mask0), SI.Sh32);
  Hl2 = Builder.CreateAnd(Hl2, SI.Mask1);

  auto *Hh = Builder.CreateAnd(Builder.CreateShl(L.Hi, SI.Sha), SI.Mask1);

  auto *Hi = Builder.CreateOr(Hh, Builder.CreateOr(Hl1, Hl2));
  return SplitBuilder.combineLoHiSplit(
      {Lo, Hi}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Op.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitLShr(BinaryOperator &Op) {
  return buildRightShift(SplitBuilder, Op);
}
Value *GenXEmulate::Emu64Expander::visitAShr(BinaryOperator &Op) {
  return buildRightShift(SplitBuilder, Op);
}

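// zext/sext to 64 bit are emulated directly on the 32-bit halves:
//   zext: lo = zext-to-i32(src), hi = 0
//   sext: lo = sext-to-i32(src), hi = lo >> 31 (arithmetic shift replicates
//         the sign bit across the high part)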
Value *GenXEmulate::Emu64Expander::visitZExtInst(ZExtInst &I) {
  auto Builder = getIRBuilder();
  auto VOp = toVector(Builder, I.getOperand(0));
  Value *LoPart = VOp.V;
  if (VOp.VTy->getScalarType()->getPrimitiveSizeInBits() < 32) {
    auto *ExtendedType = IGCLLVM::FixedVectorType::get(
        Builder.getInt32Ty(), VOp.VTy->getNumElements());
    LoPart = Builder.CreateZExt(LoPart, ExtendedType, ".zext32");
  }
  auto *ZeroValue = Constant::getNullValue(LoPart->getType());
  return SplitBuilder.combineLoHiSplit({LoPart, ZeroValue}, "int_emu.zext64.",
                                       Inst.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitSExtInst(SExtInst &I) {
  auto Builder = getIRBuilder();
  auto VOp = toVector(Builder, I.getOperand(0));
  auto *LoPart = VOp.V;
  if (VOp.VTy->getScalarType()->getPrimitiveSizeInBits() < 32) {
    auto *ExtendedType = IGCLLVM::FixedVectorType::get(
        Builder.getInt32Ty(), VOp.VTy->getNumElements());
    LoPart = Builder.CreateSExt(LoPart, ExtendedType, ".sext32");
  }
  auto *HiPart = Builder.CreateAShr(LoPart, 31u, ".sign_hi");
  return SplitBuilder.combineLoHiSplit({LoPart, HiPart}, "int_emu.sext64.",
                                       Inst.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitPtrToInt(PtrToIntInst &I) {

  const DataLayout &DL = I.getModule()->getDataLayout();
  // do not emulate noop
  if (cast<CastInst>(&I)->isNoopCast(DL))
    return nullptr;

  if (!OptProcessPtrs) {
    LLVM_DEBUG(dbgs() << "i64-emu::WARNING: " << I << " won't be emulated\n");
    return nullptr;
  }
  // ptr32 -> i64 conversions are not supported
  if (DL.getTypeSizeInBits(I.getOperand(0)->getType()->getScalarType()) <
      DL.getTypeSizeInBits(I.getType()->getScalarType())) {
    LLVM_DEBUG(dbgs() << "i64-emu::ERROR: " << I << " can't be emulated\n");
    vc::diagnose(I.getContext(), "GenXEmulate", &I,
                 "ptr32->i64 extensions are not supported");
  }

  auto Builder = getIRBuilder();
  auto VOp = toVector(Builder, I.getOperand(0));

  auto *VTy64 = IGCLLVM::FixedVectorType::get(Builder.getInt64Ty(),
                                              VOp.VTy->getNumElements());
  auto *Cast = Builder.CreatePtrToInt(VOp.V, VTy64);

  auto *ResTy = I.getType();
  unsigned Stride =
      VTy64->getPrimitiveSizeInBits() / ResTy->getPrimitiveSizeInBits();
  unsigned NumElements = VOp.VTy->getNumElements();

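  // Truncation is done with a strided rdregion: the i64 vector is viewed as
  // a vector of result-sized elements, and the region reads the low element
  // of every 64-bit lane (stride == 64 / result-element-size).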
  auto *VElTy = IGCLLVM::FixedVectorType::get(ResTy->getScalarType(),
                                              Stride * NumElements);
  auto *ElCast = Builder.CreateBitCast(Cast, VElTy, "int_emu.ptr2int.elcast.");
  genx::Region R(ElCast);
  R.NumElements = NumElements;
  R.Stride = Stride;
  R.Width = NumElements;
  R.VStride = R.Stride * R.Width;
  auto *Result = (Value *)R.createRdRegion(
      ElCast, "int_emu.trunc." + I.getName() + ".", &I, I.getDebugLoc());
  if (Result->getType() != ResTy) {
    Result = Builder.CreateBitCast(
        Result, ResTy, Twine("int_emu.trunc.") + I.getName() + ".to_s.");
  }
  return Result;
}
Value *GenXEmulate::Emu64Expander::visitIntToPtr(IntToPtrInst &I) {

  const DataLayout &DL = I.getModule()->getDataLayout();
  // do not emulate noop
  if (cast<CastInst>(&I)->isNoopCast(DL))
    return nullptr;

  if (!OptProcessPtrs) {
    LLVM_DEBUG(dbgs() << "i64-emu::WARNING: " << I << " won't be emulated\n");
    return nullptr;
  }
  // i64 -> ptr32 truncations are not supported
  if (DL.getTypeSizeInBits(I.getOperand(0)->getType()->getScalarType()) >
      DL.getTypeSizeInBits(I.getType()->getScalarType())) {
    LLVM_DEBUG(dbgs() << "i64-emu::ERROR: " << I << " can't be emulated\n");
    vc::diagnose(I.getContext(), "GenXEmulate", &I,
                 "i64->ptr32 truncations are not supported");
  }

  auto Builder = getIRBuilder();
  auto VOp = toVector(Builder, I.getOperand(0));

  auto *VTy32 = IGCLLVM::FixedVectorType::get(Builder.getInt32Ty(),
                                              VOp.VTy->getNumElements());
  auto *VTy64 = IGCLLVM::FixedVectorType::get(Builder.getInt64Ty(),
                                              VOp.VTy->getNumElements());
  Value *VI32 = VOp.V;
  if (VOp.VTy != VTy32)
    VI32 = Builder.CreateZExt(VOp.V, VTy32);

  auto *Zext64 = Builder.CreateZExt(VI32, VTy64);
  auto *Zext = ensureEmulated(Zext64);

  Type *ResType = I.getType();
  Type *CnvType = ResType;
  if (!ResType->isVectorTy()) {
    CnvType = IGCLLVM::FixedVectorType::get(ResType, 1);
  }
  auto *Result = Builder.CreateIntToPtr(Zext, CnvType);
  if (ResType != CnvType) {
    Result = Builder.CreateBitCast(Result, ResType,
                                   Twine("int_emu.") + I.getOpcodeName() + ".");
  }
  return Result;
}
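// Saturating truncation from i64: clamp the 64-bit value to the destination
// range with (emulated) 64-bit compare+select, then take the low 32 bits;
// for sub-32-bit destinations the clamped low half is re-fed to the 32-bit
// trunc.sat intrinsic.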
Value *GenXEmulate::Emu64Expander::visitGenxTrunc(CallInst &CI) {

  auto IID = GenXIntrinsic::getAnyIntrinsicID(&Inst);
  unsigned DstSize = CI.getType()->getScalarType()->getPrimitiveSizeInBits();
  IGC_ASSERT(DstSize == 8 || DstSize == 16 || DstSize == 32 || DstSize == 64);

  // early exit
  if (IID == GenXIntrinsic::genx_uutrunc_sat ||
      IID == GenXIntrinsic::genx_sstrunc_sat) {
    if (DstSize == 64)
      return CI.getOperand(0);
  }

  auto Builder = getIRBuilder();
  auto VOp = toVector(Builder, CI.getOperand(0));

  auto MakeConstantSplat64 = [](IRBuilder &B, IGCLLVM::FixedVectorType *VTy,
                                uint64_t Value) {
    auto *KV = Constant::getIntegerValue(B.getInt64Ty(), APInt(64, Value));
    return ConstantDataVector::getSplat(VTy->getNumElements(), KV);
  };
  auto MaxDstSigned = [&](unsigned DstSize) {
    uint64_t MaxVal = (1ull << (DstSize - 1)) - 1;
    return MakeConstantSplat64(Builder, VOp.VTy, MaxVal);
  };
  auto MinDstSigned = [&](unsigned DstSize) {
    uint64_t Ones = ~0ull;
    uint64_t MinVal = Ones << (DstSize - 1);
    return MakeConstantSplat64(Builder, VOp.VTy, MinVal);
  };
  auto MaxDstUnsigned = [&](unsigned DstSize) {
    uint64_t MaxVal = ~0ull;
    MaxVal = MaxVal >> (64 - DstSize);
    return MakeConstantSplat64(Builder, VOp.VTy, MaxVal);
  };
  auto MinDstUnsigned = [&](unsigned DstSize) {
    return MakeConstantSplat64(Builder, VOp.VTy, 0);
  };

  Value *Cond1 = nullptr;
  Value *Limit1 = nullptr;
  // optional
  Value *Cond2 = nullptr;
  Value *Limit2 = nullptr;

  switch (IID) {
  case GenXIntrinsic::genx_uutrunc_sat:
    // UGT maxDstUnsigned -> maxDstUnsigned
    Limit1 = MaxDstUnsigned(DstSize);
    Cond1 = ensureEmulated(Builder.CreateICmpUGT(VOp.V, Limit1));
    break;
  case GenXIntrinsic::genx_sstrunc_sat:
    // Result = Operand
    // SGT (maxDstSigned) -> maxDstSigned
    // SLT (minDstSigned) -> minDstSigned
    // trunc
    Limit1 = MaxDstSigned(DstSize);
    Cond1 = ensureEmulated(Builder.CreateICmpSGT(VOp.V, Limit1));
    Limit2 = MinDstSigned(DstSize);
    Cond2 = ensureEmulated(Builder.CreateICmpSLT(VOp.V, Limit2));
    break;
  case GenXIntrinsic::genx_ustrunc_sat: // unsigned result, signed operand
    // UGE (maxDstUnsigned) -> maxDstUnsigned
    // Operand < 0 -> 0
    // trunc
    Limit1 = MaxDstUnsigned(DstSize);
    Cond1 = ensureEmulated(Builder.CreateICmpUGE(VOp.V, Limit1));
    Limit2 = MinDstUnsigned(DstSize);
    Cond2 = ensureEmulated(Builder.CreateICmpSLT(VOp.V, Limit2));
    break;
  case GenXIntrinsic::genx_sutrunc_sat: // signed result, unsigned operand
    // UGT (maxDstSigned) -> maxDstSigned
    // trunc
    Limit1 = MaxDstSigned(DstSize);
    Cond1 = ensureEmulated(Builder.CreateICmpUGT(VOp.V, Limit1));
    break;
  }
  IGC_ASSERT(Cond1 && Limit1);
  auto *Result = ensureEmulated(Builder.CreateSelect(Cond1, Limit1, VOp.V));
  if (Cond2) {
    Result = ensureEmulated(Builder.CreateSelect(Cond2, Limit2, Result));
  }
  if (DstSize <= 32) {
    auto Splitted = SplitBuilder.splitValueLoHi(*Result);
    if (DstSize == 32) {
      Result = Splitted.Lo;
    } else {
      // DIRTY HACK: since currently our backend does not support
      // llvm trunc instruction, we just build a 32-bit trunc.sat instead
      unsigned ElNum = VOp.VTy->getNumElements();
      auto *CnvType =
          IGCLLVM::FixedVectorType::get(CI.getType()->getScalarType(), ElNum);
      // Result = Builder.CreateTrunc(Result, CnvType);
      Function *TrSatF = GenXIntrinsic::getAnyDeclaration(
          CI.getModule(), IID, {CnvType, Splitted.Lo->getType()});
      Result =
          Builder.CreateCall(TrSatF, Splitted.Lo, "int_emu.trunc.sat.small.");
    }
  }
  if (Result->getType() == CI.getType())
    return Result;

  return Builder.CreateBitCast(Result, CI.getType());
}
Value *GenXEmulate::Emu64Expander::visitGenxMinMax(CallInst &CI) {

  auto Builder = getIRBuilder();
  Value *Lhs = CI.getOperand(0);
  Value *Rhs = CI.getOperand(1);

  Value *CondVal = nullptr;
  // We create two 64-bit operations: compare and select.
  // Then we replace those with yet another expander instance.
  auto IID = GenXIntrinsic::getAnyIntrinsicID(&Inst);
  switch (IID) {
  case GenXIntrinsic::genx_umax:
    CondVal = Builder.CreateICmpUGT(Lhs, Rhs);
    break;
  case GenXIntrinsic::genx_smax:
    CondVal = Builder.CreateICmpSGT(Lhs, Rhs);
    break;
  case GenXIntrinsic::genx_umin:
    CondVal = Builder.CreateICmpULT(Lhs, Rhs);
    break;
  case GenXIntrinsic::genx_smin:
    CondVal = Builder.CreateICmpSLT(Lhs, Rhs);
    break;
  }
  IGC_ASSERT(CondVal);
  CondVal = ensureEmulated(CondVal);
  return ensureEmulated(Builder.CreateSelect(CondVal, Lhs, Rhs));
}

Value *GenXEmulate::Emu64Expander::visitGenxAbsi(CallInst &CI) {
  auto Builder = getIRBuilder();
  auto Src = SplitBuilder.splitOperandLoHi(0);
  // check the sign bit of the high part; if the value is negative,
  // select the (emulated) 64-bit negation, otherwise the value itself
  ConstantEmitter K(Src.Hi);
  auto *VOprnd = toVector(Builder, CI.getOperand(0)).V;
  // This would be a 64-bit operation on vector types
  auto *NegatedOpnd = Builder.CreateNeg(VOprnd);
  NegatedOpnd = ensureEmulated(NegatedOpnd);

  auto NegSplit = SplitBuilder.splitValueLoHi(*NegatedOpnd);

  auto *FlagSignSet = Builder.CreateICmpSLT(Src.Hi, K.getZero());
  auto *Lo = Builder.CreateSelect(FlagSignSet, NegSplit.Lo, Src.Lo);
  auto *Hi = Builder.CreateSelect(FlagSignSet, NegSplit.Hi, Src.Hi);

  return SplitBuilder.combineLoHiSplit({Lo, Hi}, "int_emu.genxabsi.",
                                       CI.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::visitGenxAddSat(CallInst &CI) {

  auto Src0 = SplitBuilder.splitOperandLoHi(0);
  auto Src1 = SplitBuilder.splitOperandLoHi(1);

  auto *M = CI.getModule();

  auto Builder = getIRBuilder();
  ConstantEmitter K(Src0.Lo);

  Value *Result = nullptr;
  auto IID = GenXIntrinsic::getAnyIntrinsicID(&Inst);
  switch (IID) {
  case GenXIntrinsic::genx_uuadd_sat: {
    if (!SplitBuilder.IsI64Operation()) {
      auto LoAdd =
          buildAddc(M, Builder, *Src0.Lo, *Src1.Lo, "int_emu.uuadd.lo");
      // if there is any non-zero byte in the hi parts of the srcs
      // then positive saturation is produced
      auto *PosSat =
          Builder.CreateOr(Builder.CreateOr(Src0.Hi, Src1.Hi), LoAdd.CB);
      auto *Saturated =
          Builder.CreateICmpNE(PosSat, K.getZero(), "int_emu.uuadd.sat");
      Result = Builder.CreateSelect(Saturated, K.getOnes(), LoAdd.Val);
    } else {
      auto LoAdd =
          buildAddc(M, Builder, *Src0.Lo, *Src1.Lo, "int_emu.uuadd.lo");
      auto HiAdd1 =
          buildAddc(M, Builder, *Src0.Hi, *Src1.Hi, "int_emu.uuadd.hi1.");
      // add carry from low part
      auto HiAdd2 =
          buildAddc(M, Builder, *HiAdd1.Val, *LoAdd.CB, "int_emu.uuadd.h2.");

      auto *HiResult = HiAdd2.Val;
      auto *Saturated =
          Builder.CreateICmpNE(Builder.CreateOr(HiAdd1.CB, HiAdd2.CB),
                               K.getZero(), "int_emu.uuadd.sat.");
      auto *Lo = Builder.CreateSelect(Saturated, K.getOnes(), LoAdd.Val);
      auto *Hi = Builder.CreateSelect(Saturated, K.getOnes(), HiResult);
      Result = SplitBuilder.combineLoHiSplit({Lo, Hi}, "int_emu.uuadd.",
                                             CI.getType()->isIntegerTy());
    }
  } break;
  case GenXIntrinsic::genx_ssadd_sat: {
    auto LoAdd = buildAddc(M, Builder, *Src0.Lo, *Src1.Lo, "int_emu.ssadd.lo");
    auto HiAdd1 =
        buildAddc(M, Builder, *Src0.Hi, *Src1.Hi, "int_emu.ssadd.hi1.");
    // add carry from low part
    auto HiAdd2 =
        buildAddc(M, Builder, *HiAdd1.Val, *LoAdd.CB, "int_emu.ssadd.h2.");
    auto *MaskBit31 = K.getSplat(1 << 31);
    auto *MaxSigned32 = K.getSplat((1u << 31u) - 1u);
    // Overflow = (x >> (os - 1)) == (y >> (os - 1)) &&
    //            (x >> (os - 1)) != (result >> (os - 1)) ? 1 : 0;
    auto *SignOp0 = Builder.CreateAnd(Src0.Hi, MaskBit31);
    auto *SignOp1 = Builder.CreateAnd(Src1.Hi, MaskBit31);
    auto *SignRes = Builder.CreateAnd(HiAdd2.Val, MaskBit31);

    auto *FlagSignOpMatch = Builder.CreateICmpEQ(SignOp0, SignOp1);
    auto *FlagSignResMismatch = Builder.CreateICmpNE(SignOp0, SignRes);
    auto *FlagOverflow =
        Builder.CreateAnd(FlagSignOpMatch, FlagSignResMismatch);

    // by default we assume that we have positive saturation
    auto *Lo = Builder.CreateSelect(FlagOverflow, K.getOnes(), LoAdd.Val);
    auto *Hi = Builder.CreateSelect(FlagOverflow, MaxSigned32, HiAdd2.Val);
    // if negative, change the saturation value
    auto *FlagNegativeSat = Builder.CreateAnd(
        FlagOverflow, Builder.CreateICmpSLT(SignOp0, K.getZero()));
    Lo = Builder.CreateSelect(FlagNegativeSat, K.getZero(), Lo);
    Hi = Builder.CreateSelect(FlagNegativeSat, K.getSplat(1 << 31), Hi);

    Result = SplitBuilder.combineLoHiSplit({Lo, Hi}, "int_emu.ssadd.",
                                           CI.getType()->isIntegerTy());
  } break;
  case GenXIntrinsic::genx_suadd_sat:
    report_fatal_error(
        "int_emu: genx_suadd_sat is not supported by VC backend");
    break;
  case GenXIntrinsic::genx_usadd_sat:
    report_fatal_error(
        "int_emu: genx_usadd_sat is not supported by VC backend");
    break;
  default:
    IGC_ASSERT_MESSAGE(0, "unknown intrinsic passed to saturation add emu");
  }

  if (Result->getType() != CI.getType()) {
    auto TruncID = (IID == GenXIntrinsic::genx_uuadd_sat)
                       ? GenXIntrinsic::genx_uutrunc_sat
                       : GenXIntrinsic::genx_sstrunc_sat;
    auto *TruncFunct = GenXIntrinsic::getGenXDeclaration(
        M, TruncID, {CI.getType(), Result->getType()});
    Result = Builder.CreateCall(TruncFunct, {Result}, "int_emu.trunc.sat");
    Result = ensureEmulated(Result);
  }

  return Result;
}

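// fp-to-int saturating conversions are not expanded inline; they dispatch to
// a precompiled emulation routine looked up in EmulationFuns by
// (opcode, result type, first-argument type).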
Value *GenXEmulate::Emu64Expander::visitGenxFPToISat(CallInst &CI) {
  if (CI.getType()->getScalarType()->isDoubleTy())
    vc::diagnose(CI.getContext(), "GenXEmulate", &CI,
                 "double->UI conversions are not supported");

  auto IID = GenXIntrinsic::getAnyIntrinsicID(&Inst);
  IGC_ASSERT_MESSAGE(IID == GenXIntrinsic::genx_fptosi_sat ||
                         IID == GenXIntrinsic::genx_fptoui_sat,
                     "unknown intrinsic passed to fptoi_sat emu");
  const bool IsSigned = IID == GenXIntrinsic::genx_fptosi_sat;

  auto Builder = getIRBuilder();
  unsigned Opcode = IsSigned ? Instruction::FPToSI : Instruction::FPToUI;

  Type *Ty = CI.getType();
  auto *F = CI.getCalledFunction();
  IGC_ASSERT(F);
  Type *Ty2 = IGCLLVM::getArg(*F, 0)->getType();
  OpType OpAndType{Opcode, Ty, Ty2};
  if (!EmulationFuns)
    vc::diagnose(CI.getContext(), "GenXEmulate", &CI,
                 "Emulation was called without initialization");

  auto Iter = EmulationFuns->find(OpAndType);
  if (Iter == EmulationFuns->end())
    vc::diagnose(CI.getContext(), "GenXEmulate", &CI,
                 "Unsupported instruction for emulation");

  SmallVector<Value *, 8> Args(CI.arg_operands());

  return Builder.CreateCall(Iter->second, Args);
}

Value *GenXEmulate::Emu64Expander::visitCallInst(CallInst &CI) {
  switch (GenXIntrinsic::getAnyIntrinsicID(&Inst)) {
  case GenXIntrinsic::genx_uutrunc_sat:
  case GenXIntrinsic::genx_sstrunc_sat:
  case GenXIntrinsic::genx_ustrunc_sat:
  case GenXIntrinsic::genx_sutrunc_sat:
    return visitGenxTrunc(CI);
  case GenXIntrinsic::genx_umin:
  case GenXIntrinsic::genx_umax:
  case GenXIntrinsic::genx_smin:
  case GenXIntrinsic::genx_smax:
    return visitGenxMinMax(CI);
  case GenXIntrinsic::genx_absi:
    return visitGenxAbsi(CI);
  case GenXIntrinsic::genx_suadd_sat:
  case GenXIntrinsic::genx_usadd_sat:
  case GenXIntrinsic::genx_uuadd_sat:
  case GenXIntrinsic::genx_ssadd_sat:
    return visitGenxAddSat(CI);
  case GenXIntrinsic::genx_fptosi_sat:
  case GenXIntrinsic::genx_fptoui_sat:
    return visitGenxFPToISat(CI);
  }
  return nullptr;
}
Value *GenXEmulate::Emu64Expander::ensureEmulated(Value *Val) {
  Instruction *Inst = dyn_cast<Instruction>(Val);
  if (!Inst)
    return Val;
  auto *Emulated = Emu64Expander(ST, *Inst, EmulationFuns).tryExpand();
  if (!Emulated)
    return Val;
  Inst->eraseFromParent();
  return Emulated;
}
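// Computes A + B + C; uses the native add3 instruction where the target
// provides it (hasAdd3Bfn), otherwise falls back to two chained adds.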
Value *GenXEmulate::Emu64Expander::buildTernaryAddition(
    IRBuilder &Builder, Value &A, Value &B, Value &C, const Twine &Name) const {
  if (ST.hasAdd3Bfn()) {
    auto *Add3Funct = GenXIntrinsic::getGenXDeclaration(
        Inst.getModule(), GenXIntrinsic::genx_add3, {A.getType(), B.getType()});
    return Builder.CreateCall(Add3Funct, {&A, &B, &C}, "add3." + Name);
  }
  auto *SubH = Builder.CreateAdd(&A, &B, Name + ".part");
  return Builder.CreateAdd(SubH, &C, Name);
}
GenXEmulate::Emu64Expander::AddSubExtResult
GenXEmulate::Emu64Expander::buildAddc(Module *M, IRBuilder &Builder, Value &L,
                                      Value &R, const Twine &Prefix) {
  IGC_ASSERT(L.getType() == R.getType());

  auto *AddcFunct = GenXIntrinsic::getGenXDeclaration(
      M, GenXIntrinsic::genx_addc, {L.getType(), R.getType()});

  using namespace GenXIntrinsic::GenXResult;
  auto *AddcVal =
      Builder.CreateCall(AddcFunct, {&L, &R}, Prefix + "aggregate.");
  auto *Add =
      Builder.CreateExtractValue(AddcVal, {IdxAddc_Add}, Prefix + "add.");
  auto *Carry =
      Builder.CreateExtractValue(AddcVal, {IdxAddc_Carry}, Prefix + "carry.");
  return {Add, Carry};
}
GenXEmulate::Emu64Expander::AddSubExtResult
GenXEmulate::Emu64Expander::buildSubb(Module *M, IRBuilder &Builder, Value &L,
                                      Value &R, const Twine &Prefix) {

  IGC_ASSERT(L.getType() == R.getType());

  auto *SubbFunct = GenXIntrinsic::getGenXDeclaration(
      M, GenXIntrinsic::genx_subb, {L.getType(), R.getType()});

  using namespace GenXIntrinsic::GenXResult;
  auto *SubbVal =
      Builder.CreateCall(SubbFunct, {&L, &R}, Prefix + "aggregate.");
  auto *Sub =
      Builder.CreateExtractValue(SubbVal, {IdxSubb_Sub}, Prefix + "sub.");
  auto *Borrow =
      Builder.CreateExtractValue(SubbVal, {IdxSubb_Borrow}, Prefix + "borrow.");
  return {Sub, Borrow};
}
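// Builds a 64-bit compare from the 32-bit halves. For EQ/NE both halves are
// compared and the results AND-ed/OR-ed; for relational predicates the
// scheme is, e.g. for ult:
//   result = (hi0 <u hi1) || (hi0 == hi1 && lo0 <u lo1)
// where the lo comparison always uses the unsigned form of the predicate and
// the hi comparison keeps its signedness with the EQ part stripped.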
Value *GenXEmulate::Emu64Expander::buildGeneralICmp(IRBuilder &Builder,
                                                    CmpInst::Predicate P,
                                                    bool IsPartialPredicate,
                                                    const LHSplit &Src0,
                                                    const LHSplit &Src1) {

  auto getEmulateCond1 = [](const CmpInst::Predicate P) {
    // For the unsigned predicate the first condition stays the same
    if (CmpInst::isUnsigned(P))
      return P;
    switch (P) {
    // transform signed predicate to an unsigned one
    case CmpInst::ICMP_SGT:
      return CmpInst::ICMP_UGT;
    case CmpInst::ICMP_SGE:
      return CmpInst::ICMP_UGE;
    case CmpInst::ICMP_SLT:
      return CmpInst::ICMP_ULT;
    case CmpInst::ICMP_SLE:
      return CmpInst::ICMP_ULE;
    default:
      llvm_unreachable("unexpected ICMP predicate for first condition");
    }
  };
  auto getEmulateCond2 = [](const CmpInst::Predicate P) {
    // discard the EQ part
    switch (P) {
    case CmpInst::ICMP_SGT:
    case CmpInst::ICMP_SGE:
      return CmpInst::ICMP_SGT;
    case CmpInst::ICMP_SLT:
    case CmpInst::ICMP_SLE:
      return CmpInst::ICMP_SLT;
    case CmpInst::ICMP_UGT:
    case CmpInst::ICMP_UGE:
      return CmpInst::ICMP_UGT;
    case CmpInst::ICMP_ULT:
    case CmpInst::ICMP_ULE:
      return CmpInst::ICMP_ULT;
    default:
      llvm_unreachable("unexpected ICMP predicate for second condition");
    }
  };
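
  // The second condition compares the high halves with the strict
  // (EQ-discarded) predicate; the first compares the low halves unsigned.
  // E.g. for a signed 64-bit "a < b" this builds, per lane:
  //   (a.Hi <s b.Hi) || ((a.Hi == b.Hi) && (a.Lo <u b.Lo))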
  std::pair<Value *, Value *> ResultParts = {};
  switch (P) {
  case CmpInst::ICMP_EQ: {
    auto *T0 = Builder.CreateICmpEQ(Src0.Lo, Src1.Lo);
    auto *T1 = Builder.CreateICmpEQ(Src0.Hi, Src1.Hi);
    ResultParts = {T0, T1};
    break;
  }
  case CmpInst::ICMP_NE: {
    auto *T0 = Builder.CreateICmpNE(Src0.Lo, Src1.Lo);
    auto *T1 = Builder.CreateICmpNE(Src0.Hi, Src1.Hi);
    ResultParts = {T0, T1};
    break;
  }
  default: {
    CmpInst::Predicate EmuP1 = getEmulateCond1(P);
    CmpInst::Predicate EmuP2 = getEmulateCond2(P);
    auto *T0 = Builder.CreateICmp(EmuP1, Src0.Lo, Src1.Lo);
    auto *T1 = Builder.CreateICmpEQ(Src0.Hi, Src1.Hi);
    auto *T2 = Builder.CreateAnd(T1, T0);
    auto *T3 = Builder.CreateICmp(EmuP2, Src0.Hi, Src1.Hi);
    ResultParts = {T2, T3};
    break;
  }
  }
  auto ResultCond = (P == CmpInst::ICMP_EQ) ? Instruction::BinaryOps::And
                                            : Instruction::BinaryOps::Or;
  if (!IsPartialPredicate || !OptConvertPartialPredicates) {
    return Builder.CreateBinOp(
        ResultCond, ResultParts.first, ResultParts.second,
        "int_emu.cmp." + CmpInst::getPredicateName(P) + ".");
  }
  // Note:
  // The reason for this conversion is that our backend has no convenient
  // way to represent partial updates of predicates with anything except
  // icmp instructions. With the current codebase we are unable to create
  // a proper visa for the following case ("pseudo" IR):
  // bale {
  //   %ne1 = or <8 x i1> %a, %b
  //   %j = call <16 x i1> wrpredregion(<16 x i1> undef, <8 x i1> %ne1, i32 0)
  // }
  // bale {
  //   %ne2 = or <8 x i1> %c, %d
  //   %joined = call <16 x i1> wrpredregion(<16 x i1> %j, <8 x i1> %ne2, i32 8)
  // }
  // As such, we convert these cases to the following sequence: 2xsel->or->cmp.
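  // An illustrative sketch of the converted form (the element type follows
  // the split operands, i32 here; "or" becomes "and" for the EQ case):
  //   %l = select <8 x i1> %p0, <8 x i32> <all-ones>, <8 x i32> zeroinitializer
  //   %r = select <8 x i1> %p1, <8 x i32> <all-ones>, <8 x i32> zeroinitializer
  //   %i = or <8 x i32> %l, %r
  //   %res = icmp eq <8 x i32> %i, <all-ones>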
  ConstantEmitter K(Src0.Lo);
  auto *L = Builder.CreateSelect(ResultParts.first, K.getOnes(), K.getZero());
  auto *R = Builder.CreateSelect(ResultParts.second, K.getOnes(), K.getZero());
  auto *IPred = Builder.CreateBinOp(ResultCond, L, R,
                                    "int_emu.cmp.part.int." +
                                        CmpInst::getPredicateName(P) + ".");
  return Builder.CreateICmpEQ(IPred, K.getOnes(),
                              "int_emu.cmp.part.i1" +
                                  CmpInst::getPredicateName(P) + ".");
}
Value *GenXEmulate::Emu64Expander::buildRightShift(IVSplitter &SplitBuilder,
                                                   BinaryOperator &Op) {
  auto Builder = getIRBuilder();

  llvm::SmallVector<uint32_t, 8> ShaVals;
  if (getConstantUI32Values(Op.getOperand(1), ShaVals)) {
    auto *Result = tryOptimizedShr(Builder, SplitBuilder, Op, ShaVals);
    if (Result)
      return Result;
  }
  return buildGenericRShift(Builder, SplitBuilder, Op);
}
Value *GenXEmulate::Emu64Expander::tryOptimizedShr(IRBuilder &Builder,
                                                   IVSplitter &SplitBuilder,
                                                   BinaryOperator &Op,
                                                   ArrayRef<uint32_t> Sa) {
  auto Operand = SplitBuilder.splitOperandLoHi(0);
  Value *LoPart{};
  Value *HiPart{};

  ConstantEmitter K(Operand.Lo);

  bool IsLogical = Op.getOpcode() == Instruction::LShr;

  if (std::all_of(Sa.begin(), Sa.end(), LessThan32())) {
    if (std::find(Sa.begin(), Sa.end(), 0) != Sa.end()) {
      // TODO: for now, we bail out if a zero shift is encountered.
      // Theoretically, we could mask out potentially poisoned values by
      // inserting a [cmp/select] pair at the end of this branch, but for
      // now bailing out is the safer choice.
      return nullptr;
    }
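    // For per-lane shift amounts 0 < n < 32, the split halves combine as:
    //   Result.Lo = (Lo >> n) | (Hi << (32 - n))
    //   Result.Hi = Hi >> n   (logical or arithmetic shift, as requested)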
    auto *ShiftA = ConstantDataVector::get(Builder.getContext(), Sa);
    auto *Lo1 = Builder.CreateLShr(Operand.Lo, ShiftA);
    auto *Hi = (IsLogical) ? Builder.CreateLShr(Operand.Hi, ShiftA)
                           : Builder.CreateAShr(Operand.Hi, ShiftA);
    auto *C32 = K.getSplat(32);
    auto *CShift = ConstantExpr::getSub(C32, ShiftA);
    auto *Lo2 = Builder.CreateShl(Operand.Hi, CShift);
    LoPart = Builder.CreateOr(Lo1, Lo2);
    HiPart = Hi;
  } else if (std::all_of(Sa.begin(), Sa.end(), Equals32())) {
    LoPart = Operand.Hi;
    if (IsLogical) {
      HiPart = K.getZero();
    } else {
      auto *C31 = K.getSplat(31);
      HiPart = Builder.CreateAShr(Operand.Hi, C31);
    }
  } else if (std::all_of(Sa.begin(), Sa.end(), GreaterThan32())) {
    auto *C32 = K.getSplat(32);
    auto *CRawShift = ConstantDataVector::get(Builder.getContext(), Sa);
    auto *CShift = ConstantExpr::getSub(CRawShift, C32);
    if (IsLogical) {
      LoPart = Builder.CreateLShr(Operand.Hi, CShift);
      HiPart = K.getZero();
    } else {
      auto *C31 = K.getSplat(31);
      LoPart = Builder.CreateAShr(Operand.Hi, CShift);
      HiPart = Builder.CreateAShr(Operand.Hi, C31);
    }
  } else {
    return nullptr;
  }
  IGC_ASSERT_MESSAGE(LoPart && HiPart, "could not construct optimized shr");
  return SplitBuilder.combineLoHiSplit(
      {LoPart, HiPart}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Op.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::tryOptimizedShl(IRBuilder &Builder,
                                                   IVSplitter &SplitBuilder,
                                                   BinaryOperator &Op,
                                                   ArrayRef<uint32_t> Sa) {
  auto Operand = SplitBuilder.splitOperandLoHi(0);
  Value *LoPart{};
  Value *HiPart{};

  ConstantEmitter K(Operand.Lo);

  if (std::all_of(Sa.begin(), Sa.end(), LessThan32())) {
    if (std::find(Sa.begin(), Sa.end(), 0) != Sa.end()) {
      // TODO: for now, we bail out if a zero shift is encountered.
      // Theoretically, we could mask out potentially poisoned values by
      // inserting a [cmp/select] pair at the end of this branch, but for
      // now bailing out is the safer choice.
      return nullptr;
    }
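    // For per-lane shift amounts 0 < n < 32, the split halves combine as:
    //   Result.Lo = Lo << n
    //   Result.Hi = (Hi << n) | (Lo >> (32 - n))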
    auto *CRawShift = ConstantDataVector::get(Builder.getContext(), Sa);
    LoPart = Builder.CreateShl(Operand.Lo, CRawShift);
    auto *C32 = K.getSplat(32);
    auto *CShift = ConstantExpr::getSub(C32, CRawShift);
    auto *Hi1 = Builder.CreateShl(Operand.Hi, CRawShift);
    auto *Hi2 = Builder.CreateLShr(Operand.Lo, CShift);
    HiPart = Builder.CreateOr(Hi1, Hi2);
  } else if (std::all_of(Sa.begin(), Sa.end(), Equals32())) {
    LoPart = K.getZero();
    HiPart = Operand.Lo;
  } else if (std::all_of(Sa.begin(), Sa.end(), GreaterThan32())) {
    LoPart = K.getZero();
    auto *C32 = K.getSplat(32);
    auto *CRawShift = ConstantDataVector::get(Builder.getContext(), Sa);
    auto *CShift = ConstantExpr::getSub(CRawShift, C32);
    HiPart = Builder.CreateShl(Operand.Lo, CShift);
  } else {
    return nullptr;
  }
  IGC_ASSERT_MESSAGE(LoPart && HiPart, "could not construct optimized shl");
  return SplitBuilder.combineLoHiSplit(
      {LoPart, HiPart}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Op.getType()->isIntegerTy());
}
Value *GenXEmulate::Emu64Expander::buildGenericRShift(IRBuilder &Builder,
                                                      IVSplitter &SplitBuilder,
                                                      BinaryOperator &Op) {

  auto L = SplitBuilder.splitOperandLoHi(0);
  auto R = SplitBuilder.splitOperandLoHi(1);

  auto SI = constructShiftInfo(Builder, R.Lo);
  ConstantEmitter K(L.Lo);

  // Logical Shift Right
  // 1. Calculate MASK1. MASK1 is 0 when the shift is >= 32 (large shift).
  // 2. Calculate MASK0. MASK0 is 0 iff the shift is 0.
  // 3. Calculate the High part:
  //    [(L.Hi *LSR* Sha) *AND* MASK1], "&" discards the result on a large
  //    shift.
  // 4. Calculate the Low part as the *OR* of:
  //    [(L.Hi & MASK0) *SHL* (32 - Sha)] & MASK1, bits shifted out of the
  //    HI part into LOW;
  //    [L.Hi *LSR* (Sha - 32)] & ~MASK1, on a large shift all remaining
  //    bits occupy LOW;
  //    [(L.Lo *LSR* Sha) *AND* MASK1], "&" discards the result on a large
  //    shift.
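  // Worked example: for Sha == 40 (a large shift) Mask1 == 0, so the Hi
  // result and the in-lane Lo contributions are masked away, and only
  // Lo = L.Hi >> (40 - 32) survives (from TmpH2 in buildPartialRShift).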
  auto *Lo = buildPartialRShift(Builder, L.Lo, L.Hi, SI);
  auto *Hi = Builder.CreateAnd(Builder.CreateLShr(L.Hi, SI.Sha), SI.Mask1);

  bool IsLogical = Op.getOpcode() == Instruction::LShr;
  if (!IsLogical) {
    // Arithmetic Shift Right
    // Do all the steps from the Logical Shift, then:
    // 5. SignedMask = L.Hi *ASR* 31
    //    HIPART |= (SignedMask *SHL* (SH32 & MASK1)) & MASK0
    //      "& MASK0" => apply the full SignedMask for large shifts
    //    LOPART |= (SignedMask *SHL* (63 - Sha)) & ~MASK1
    //      "& ~MASK1" => do not apply this for small shifts
    auto *SignedMask =
        Builder.CreateAShr(L.Hi, K.getSplat(31), "int_emu.asr.sign.");

    auto *AuxHi =
        Builder.CreateShl(SignedMask, Builder.CreateAnd(SI.Sh32, SI.Mask1));
    AuxHi = Builder.CreateAnd(AuxHi, SI.Mask0);

    auto *AuxLo = Builder.CreateShl(SignedMask,
                                    Builder.CreateSub(K.getSplat(63), SI.Sha));
    AuxLo = Builder.CreateAnd(AuxLo, Builder.CreateNot(SI.Mask1));

    Lo = Builder.CreateOr(Lo, AuxLo);
    Hi = Builder.CreateOr(Hi, AuxHi);
  }
  return SplitBuilder.combineLoHiSplit(
      {Lo, Hi}, Twine("int_emu.") + Op.getOpcodeName() + ".",
      Op.getType()->isIntegerTy());
}

Value *GenXEmulate::Emu64Expander::buildPartialRShift(IRBuilder &B,
                                                      Value *SrcLo,
                                                      Value *SrcHi,
                                                      const ShiftInfo &SI) {
  ConstantEmitter K(SrcLo);
  // calculate the part that moves from the HI part to LOW
  auto *TmpH1 = B.CreateShl(B.CreateAnd(SrcHi, SI.Mask0), SI.Sh32);
  TmpH1 = B.CreateAnd(TmpH1, SI.Mask1);
  // TmpH2 is for the case when the shift amount is greater than 32
  auto *TmpH2 = B.CreateLShr(SrcHi, B.CreateSub(SI.Sha, K.getSplat(32)));
  // mask out TmpH2 if the shift is less than 32
  TmpH2 = B.CreateAnd(TmpH2, B.CreateNot(SI.Mask1));
  // Mask1 ensures that the result is discarded if the shift is large
  auto *TmpL = B.CreateAnd(B.CreateLShr(SrcLo, SI.Sha), SI.Mask1);

  return B.CreateOr(B.CreateOr(TmpL, TmpH1), TmpH2, "int_emu.shif.r.lo.");
}
GenXEmulate::Emu64Expander::ShiftInfo
GenXEmulate::Emu64Expander::constructShiftInfo(IRBuilder &B, Value *RawSha) {
  ConstantEmitter K(RawSha);

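  // ShiftInfo fields (all per-lane):
  //   Sha   - the shift amount, taken modulo 64
  //   Sh32  - (32 - Sha), used to move bits across the Lo/Hi boundary
  //   Mask1 - all-ones iff Sha < 32 (a "small" shift), zero otherwise
  //   Mask0 - all-ones iff Sha != 0, zero otherwise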
  auto *Sha = B.CreateAnd(RawSha, K.getSplat(0x3f), "int_emu.shift.sha.");
  auto *Sh32 = B.CreateSub(K.getSplat(32), Sha, "int_emu.shift.sh32.");
  auto *FlagLargeShift = B.CreateICmpUGE(Sha, K.getSplat(32));
  auto *FlagZeroShift = B.CreateICmpEQ(Sha, K.getSplat(0));

  auto *Mask1 = B.CreateSelect(FlagLargeShift, K.getZero(), K.getOnes());
  auto *Mask0 = B.CreateSelect(FlagZeroShift, K.getZero(), K.getOnes());

  return ShiftInfo{Sha, Sh32, Mask1, Mask0};
}
bool GenXEmulate::Emu64Expander::hasStrictEmulationRequirement(
    Instruction *Inst) {
  auto isI64Type = [](Type *T) {
    if (T->isVectorTy())
      T = cast<VectorType>(T)->getElementType();
    return T->isIntegerTy(64);
  };
  bool ret64 = isI64Type(Inst->getType());
  bool uses64 = false;
  for (unsigned i = 0; i < Inst->getNumOperands(); ++i) {
    uses64 |= isI64Type(Inst->getOperand(i)->getType());
  }
  // if the instruction does not touch i64 - it is free to go
  if (!ret64 && !uses64 && !isI64PointerOp(*Inst))
    return false;

  // Now things become (a little) complicated. Currently, we ignore some
  // instruction/intrinsic types, since they are accepted by the finalizer.
  // More specifically, everything that is lowered to a plain
  // (non-converting) mov is fine. It seems that sends with i64 addresses
  // are fine too.

  // skip moves
  if (GenXIntrinsic::isWrRegion(Inst) || GenXIntrinsic::isRdRegion(Inst)) {
    return OptStricterRegions;
  }

  // skip constants
  if (GenXIntrinsic::getAnyIntrinsicID(Inst) == GenXIntrinsic::genx_constanti)
    return OptStricterConst;

  switch (GenXIntrinsic::getAnyIntrinsicID(Inst)) {
  case GenXIntrinsic::genx_svm_scatter:
  case GenXIntrinsic::genx_svm_gather:
  case GenXIntrinsic::genx_svm_scatter4_scaled:
  case GenXIntrinsic::genx_svm_gather4_scaled:
  case GenXIntrinsic::genx_svm_block_st:
  case GenXIntrinsic::genx_svm_block_ld:
  case GenXIntrinsic::genx_svm_block_ld_unaligned:
    return OptStricterSVM;

  // TODO: not every atomic is covered here, we need to add more
  case GenXIntrinsic::genx_svm_atomic_add:
  case GenXIntrinsic::genx_svm_atomic_and:
  case GenXIntrinsic::genx_svm_atomic_cmpxchg:
  case GenXIntrinsic::genx_svm_atomic_dec:
  case GenXIntrinsic::genx_svm_atomic_fcmpwr:
  case GenXIntrinsic::genx_svm_atomic_fmax:
  case GenXIntrinsic::genx_svm_atomic_fmin:
  case GenXIntrinsic::genx_svm_atomic_imax:
  case GenXIntrinsic::genx_svm_atomic_imin:
  case GenXIntrinsic::genx_svm_atomic_inc:
  case GenXIntrinsic::genx_svm_atomic_max:
  case GenXIntrinsic::genx_svm_atomic_min:
  case GenXIntrinsic::genx_svm_atomic_or:
  case GenXIntrinsic::genx_svm_atomic_sub:
  case GenXIntrinsic::genx_svm_atomic_xchg:
  case GenXIntrinsic::genx_svm_atomic_xor:
    return OptStricterAtomic;

  case GenXIntrinsic::genx_oword_st:
  case GenXIntrinsic::genx_oword_ld:
  case GenXIntrinsic::genx_oword_ld_unaligned:
    return OptStricterOword;
  case GenXIntrinsic::genx_alloca:
    return OptStricterAlloc;
  case GenXIntrinsic::genx_faddr:
    return OptStricterFaddr;
  }

  switch (Inst->getOpcode()) {
  case Instruction::PtrToInt:
  case Instruction::IntToPtr: {
    const DataLayout &DL = Inst->getModule()->getDataLayout();
    if (!cast<CastInst>(Inst)->isNoopCast(DL))
      return OptStricterConverts;
    return false;
  }
  case Instruction::ICmp:
    return OptStrictChecksEnable;
  // skip bitcast and phi
  case Instruction::BitCast:
  case Instruction::PHI:
    return false;
  }
  return true;
}

void GenXEmulate::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
}

bool GenXEmulate::runOnModule(Module &M) {
  bool Changed = false;
  ST = &getAnalysis<TargetPassConfig>()
            .getTM<GenXTargetMachine>()
            .getGenXSubtarget();
  buildEmuFunCache(M);

  for (auto &F : M.getFunctionList())
    runOnFunction(F);

  Changed |= !ToErase.empty();
  processToEraseList(ToErase);

  auto IsOldEmulationFunction = [](const Function *F) {
    return F->getName().contains("__cm_intrinsic_impl_");
  };
  // Delete unused builtins, make used ones internal.
  for (auto I = M.begin(); I != M.end();) {
    Function &F = *I++;
    if (isEmulationFunction(&F) || IsOldEmulationFunction(&F)) {
      Changed = true;
      if (F.use_empty())
        F.eraseFromParent();
      else
        F.setLinkage(GlobalValue::InternalLinkage);
    }
  }

  if (!DiscracedList.empty()) {
    for (const auto *Insn : DiscracedList) {
      llvm::errs() << "I64EMU-FAILURE: " << *Insn << "\n";
    }
    report_fatal_error("int_emu: strict emulation requirements failure", false);
  }
  return Changed;
}

void GenXEmulate::runOnFunction(Function &F) {
  for (auto &BB : F.getBasicBlockList()) {
    for (auto I = BB.begin(); I != BB.end(); ++I) {
      Instruction *Inst = &*I;
      auto *NewVal = emulateInst(Inst);
      if (NewVal) {
        Inst->replaceAllUsesWith(NewVal);
        ToErase.push_back(Inst);
      }
    }
  }
}

Function *GenXEmulate::getEmulationFunction(const Instruction *Inst) const {

  unsigned Opcode = Inst->getOpcode();
  Type *Ty = Inst->getType();

  Type *Ty2 = nullptr;
  if (Inst->getNumOperands() > 0)
    Ty2 = Inst->getOperand(0)->getType();
  OpType OpAndType{Opcode, Ty, Ty2};

  auto Iter = EmulationFuns.find(OpAndType);
  if (Iter != EmulationFuns.end()) {
    LLVM_DEBUG(dbgs() << "Emulation function: " << Iter->second->getName()
                      << " shall be used for: " << *Inst << "\n");
    return Iter->second;
  }

  return nullptr;
}

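// The cache built below maps an {opcode, return type, first operand type}
// triple to the matching "__cm_intrinsic_impl_"-prefixed library routine,
// so, for example, a 64-bit SDiv resolves to the sdiv variant with the
// same vector signature.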
void GenXEmulate::buildEmuFunCache(Module &M) {
  EmulationFuns.clear();

  auto UpdateCacheIfMatch = [this](Function &F, StringRef PrefixToMatch,
                                   unsigned OpCode) {
    const auto &Name = F.getName();
    if (!Name.startswith(PrefixToMatch))
      return false;

    Type *Ty = F.getReturnType();
    Type *Ty2 = nullptr;
    if (F.arg_size() > 0)
      Ty2 = IGCLLVM::getArg(F, 0)->getType();
    IGC_ASSERT(EmulationFuns.find({OpCode, Ty, Ty2}) == EmulationFuns.end());
    EmulationFuns.insert({{OpCode, Ty, Ty2}, &F});
    return true;
  };

  for (Function &F : M.getFunctionList()) {
    if (!isEmulationFunction(&F))
      continue;
    for (auto &PrOp : DivRemPrefixes)
      UpdateCacheIfMatch(F, PrOp.Prefix, PrOp.Opcode);
    if (ST->emulateLongLong()) {
      for (auto &PrOp : EmulationFPConvertsPrefixes)
        UpdateCacheIfMatch(F, PrOp.Prefix, PrOp.Opcode);
    }
  }
}

Value *GenXEmulate::emulateInst(Instruction *Inst) {
  Function *EmuFn = getEmulationFunction(Inst);
  if (EmuFn) {
    IGC_ASSERT(isEmulationFunction(EmuFn));
    IGC_ASSERT_MESSAGE(!isa<CallInst>(Inst), "call emulation not supported yet");
    llvm::IRBuilder<> Builder(Inst);
    SmallVector<Value *, 8> Args(Inst->operands());
    return Builder.CreateCall(EmuFn, Args);
  }
  IGC_ASSERT(ST);
  if (ST->emulateLongLong()) {
    Value *NewInst = Emu64Expander(*ST, *Inst, &EmulationFuns).tryExpand();
    if (!NewInst) {
#ifndef NDEBUG
      if (Emu64Expander::hasStrictEmulationRequirement(Inst)) {
        LLVM_DEBUG(dbgs() << "i64-emu::WARNING: instruction may require "
                          << "emulation: " << *Inst << "\n");
      }
#endif // NDEBUG
      if (OptStrictChecksEnable &&
          Emu64Expander::hasStrictEmulationRequirement(Inst)) {
        DiscracedList.push_back(Inst);
      }
    }

    return NewInst;
  }
  return nullptr;
}

Instruction *llvm::genx::emulateI64Operation(const GenXSubtarget *ST,
                                             Instruction *Inst,
                                             EmulationFlag AuxAction) {
  LLVM_DEBUG(dbgs() << "i64-emu::WARNING: direct emulation routine was "
                    << "called for " << *Inst << "\n");
  Instruction *NewInst = nullptr;

  if (!ST->hasLongLong()) {
    Value *EmulatedResult = GenXEmulate::Emu64Expander(*ST, *Inst).tryExpand();
    NewInst = cast_or_null<Instruction>(EmulatedResult);
    // If there was no explicit request to enable i64 emulation - report
    // an error
    if (NewInst && !ST->emulateLongLong() && OptStrictEmulationRequests) {
      report_fatal_error("int_emu: target does not support i64 types", false);
    }
  }

  // NewInst can be nullptr if the instruction does not need emulation
  // (like various casts)
  if (!NewInst) {
    // if EmulationFlag::RAUWE was requested, the caller expects that
    // the returned instruction can be safely used.
    if (AuxAction == EmulationFlag::RAUWE)
      return Inst; // return the original instruction
    return nullptr;
  }

  switch (AuxAction) {
  case EmulationFlag::RAUW:
    Inst->replaceAllUsesWith(NewInst);
    break;
  case EmulationFlag::RAUWE:
    Inst->replaceAllUsesWith(NewInst);
    Inst->eraseFromParent();
    break;
  case EmulationFlag::None:
    // do nothing
    break;
  }
  return NewInst;
}
char GenXEmulate::ID = 0;

namespace llvm {
void initializeGenXEmulatePass(PassRegistry &);
}
INITIALIZE_PASS_BEGIN(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false)
INITIALIZE_PASS_END(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false)

ModulePass *llvm::createGenXEmulatePass() {
  initializeGenXEmulatePass(*PassRegistry::getPassRegistry());
  return new GenXEmulate;
}

namespace {
class GenXEmulationImport : public ModulePass {
public:
  static char ID;

  explicit GenXEmulationImport() : ModulePass(ID) {}
  StringRef getPassName() const override { return "GenX Emulation BiF Import"; }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<GenXBackendConfig>();
  }
  bool runOnModule(Module &M) override {
    if (OptDbgOnlyDisableDivremEmulation)
      return false;
    const GenXSubtarget &ST = getAnalysis<TargetPassConfig>()
                                  .getTM<GenXTargetMachine>()
                                  .getGenXSubtarget();

    auto ModEmuFun =
        LoadEmuFunLib(M.getContext(), M.getDataLayout(), M.getTargetTriple());
    if (!ModEmuFun)
      return false;

    PurgeUnneededEmulationFunctions(*ModEmuFun, ST);

    if (Linker::linkModules(M, std::move(ModEmuFun)))
      report_fatal_error("Error linking emulation routines");

    return true;
  }

private:
  static bool IsLibraryFunction(const Function &F) {
    const auto &Name = F.getName();
    return Name.startswith(LibraryFunctionPrefix);
  }

  template <typename FilterFunction>
  static std::vector<Function *> selectEmulationFunctions(Module &M,
                                                          FilterFunction Flt) {
    std::vector<Function *> Result;
    auto &&Selected = make_filter_range(M.functions(), [&Flt](Function &F) {
      if (!IsLibraryFunction(F))
        return false;
      return Flt(F);
    });
    llvm::transform(Selected, std::back_inserter(Result),
                    [](Function &Fn) { return &Fn; });
    return Result;
  }

  static void PurgeNon64BitDivRemFunctions(Module &M) {
    auto ToErase = selectEmulationFunctions(M, [](Function &F) {
      if (F.getReturnType()->getScalarType()->isIntegerTy(64))
        return false;
      return std::any_of(DivRemPrefixes.begin(), DivRemPrefixes.end(),
                         [&F](const auto &PrOp) {
                           return F.getName().startswith(PrOp.Prefix);
                         });
    });
    processToEraseList(ToErase);
  }

  static void PurgeFPConversionFunctions(Module &M, bool TargetHasFP64,
                                         bool TargetHasI64) {
    auto ToErase = selectEmulationFunctions(M, [=](Function &F) {
      // Skip non-converts
      if (std::none_of(EmulationFPConvertsPrefixes.begin(),
                       EmulationFPConvertsPrefixes.end(),
                       [&F](const auto &PrOp) {
                         return F.getName().startswith(PrOp.Prefix);
                       }))
        return false;

      bool IsFP64Operation =
          std::any_of(F.arg_begin(), F.arg_end(),
                      [](const auto &Arg) {
                        return Arg.getType()->getScalarType()->isDoubleTy();
                      }) ||
          F.getReturnType()->getScalarType()->isDoubleTy();

      // If the target does not support I64 but does support FP64, then
      // the fp64 converts should be preserved
      if (!TargetHasI64 && TargetHasFP64 && IsFP64Operation) {
        return false;
      }

      // If the target supports neither I64 nor FP64, then the fp64
      // converts should be removed
      if (!TargetHasI64 && !TargetHasFP64 && IsFP64Operation) {
        return true;
      }

      return TargetHasI64;
    });
    processToEraseList(ToErase);
  }


  static void PurgeUnneededEmulationFunctions(Module &ModEmuFun,
                                              const GenXSubtarget &ST) {
    if (ST.hasIntDivRem32())
      PurgeNon64BitDivRemFunctions(ModEmuFun);

    PurgeFPConversionFunctions(ModEmuFun, ST.hasFP64(), !ST.emulateLongLong());
  }

  static void DeriveRoundingAttributes(Function &F) {

    IGC_ASSERT(IsLibraryFunction(F));

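    // Emulation routines encode the FP rounding mode they require in their
    // name suffix (__rtz_/__rte_/__rtp_/__rtn_); translate that suffix into
    // the corresponding CMFloatControl attribute for the finalizer.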
    const auto &Name = F.getName();
    if (Name.contains(RoundingRtzSuffix)) {
      F.addFnAttr(genx::FunctionMD::CMFloatControl,
                  std::to_string(VCRoundingRTZ));
      return;
    }
    if (Name.contains(RoundingRteSuffix)) {
      F.addFnAttr(genx::FunctionMD::CMFloatControl,
                  std::to_string(VCRoundingRTE));
      return;
    }
    if (Name.contains(RoundingRtpSuffix)) {
      F.addFnAttr(genx::FunctionMD::CMFloatControl,
                  std::to_string(VCRoundingRTP));
      return;
    }
    if (Name.contains(RoundingRtnSuffix)) {
      F.addFnAttr(genx::FunctionMD::CMFloatControl,
                  std::to_string(VCRoundingRTN));
      return;
    }
  }

  std::unique_ptr<Module> LoadEmuFunLib(LLVMContext &Ctx, const DataLayout &DL,
                                        const std::string &Triple) {

    MemoryBufferRef EmulationBiFBuffer =
        getAnalysis<GenXBackendConfig>().getBiFModule(BiFKind::VCEmulation);

    // NOTE: to simplify LIT testing it is legal to have an empty buffer
    if (!EmulationBiFBuffer.getBufferSize())
      return nullptr;

    auto BiFModule = vc::getBiFModuleOrReportError(EmulationBiFBuffer, Ctx);

    BiFModule->setDataLayout(DL);
    BiFModule->setTargetTriple(Triple);

    for (Function &F : *BiFModule) {
      if (!IsLibraryFunction(F))
        continue;

      F.addFnAttr(genx::FunctionMD::VCEmulationRoutine);
      DeriveRoundingAttributes(F);
    }

    return BiFModule;
  }
};
} // namespace

char GenXEmulationImport::ID = 0;

namespace llvm {
void initializeGenXEmulationImportPass(PassRegistry &);
}
INITIALIZE_PASS_BEGIN(GenXEmulationImport, "GenXEmulationImport",
                      "GenXEmulationImport", false, false)
INITIALIZE_PASS_END(GenXEmulationImport, "GenXEmulationImport",
                    "GenXEmulationImport", false, false)
ModulePass *llvm::createGenXEmulationImportPass() {
  initializeGenXEmulationImportPass(*PassRegistry::getPassRegistry());
  return new GenXEmulationImport;
}