1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.hpp"
10 #include "Compiler/CodeGenContextWrapper.hpp"
11 #include "Compiler/CodeGenPublic.h"
12 #include "Compiler/IGCPassSupport.h"
13 #include "common/igc_regkeys.hpp"
14 
15 #include "common/LLVMWarningsPush.hpp"
16 #include "llvm/Config/llvm-config.h"
17 #include "llvmWrapper/IR/DerivedTypes.h"
18 #include "llvmWrapper/IR/Instructions.h"
19 #include "llvmWrapper/IR/IRBuilder.h"
20 #include "llvmWrapper/Support/Alignment.h"
21 #include "llvmWrapper/Support/TypeSize.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Module.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicInst.h"
28 #include <llvm/IR/InstVisitor.h>
29 #include "common/LLVMWarningsPop.hpp"
30 
31 #include <map>
32 #include "Probe/Assertion.h"
33 
34 using namespace llvm;
35 using namespace IGC;
36 using IGCLLVM::getAlign;
37 
namespace
{
    /// ReplaceIntrinsics pass lowers calls to unsupported intrinsics functions.
    // Two llvm instrinsics are replaced llvm.memcpy and llvm.memset. Both appear in SPIR spec.
    class ReplaceUnsupportedIntrinsics : public llvm::FunctionPass, public llvm::InstVisitor<ReplaceUnsupportedIntrinsics>
    {
    public:
        // Pointer-to-member type of the per-intrinsic replacement routines;
        // used as the value type of the m_intrinsicToFunc dispatch table.
        typedef void (ReplaceUnsupportedIntrinsics::* MemFuncPtr_t)(IntrinsicInst*);
        static char ID;

        ReplaceUnsupportedIntrinsics();

        ~ReplaceUnsupportedIntrinsics() {}

        virtual llvm::StringRef getPassName() const override
        {
            return "ReplaceUnsupportedIntrinsics";
        }

        virtual bool runOnFunction(llvm::Function& F) override;

        virtual void getAnalysisUsage(llvm::AnalysisUsage& AU) const override
        {
            AU.addRequired<CodeGenContextWrapper>();
        }

        // InstVisitor callback for intrinsic calls. Implementation is not in
        // this chunk; presumably it records replaceable intrinsics into
        // m_instsToReplace — TODO confirm against the .cpp body.
        void visitIntrinsicInst(llvm::IntrinsicInst& I);

    private:
        CodeGenContext* m_Ctx;
        // Intrinsics collected for replacement (see visitIntrinsicInst).
        std::vector<llvm::IntrinsicInst*> m_instsToReplace;

        /// Helper
        ///
        // if the value comes from a bitcast, return the source, otherwise return itself
        Value* SkipBitCast(Value* v) {
            if (BitCastInst* bc = dyn_cast<BitCastInst>(v)) {
                // Don't skip if this is a pointer cast and the addrspace changed
                if (v->getType()->isPointerTy() &&
                    bc->getOperand(0)->getType()->isPointerTy() &&
                    v->getType()->getPointerAddressSpace() != bc->getOperand(0)->getType()->getPointerAddressSpace()) {
                    return v;
                }
                v = bc->getOperand(0);
            }
            return v;
        }

        // Get the largest of power-of-2 value that is <= C AND that can divide C.
        uint32_t getLargestPowerOfTwo(uint32_t C) {
            // If C == 0 (shouldn't happen), return a big one.
            // C & (~C + 1) == C & -C isolates the lowest set bit, which is
            // exactly the largest power of two that divides C.
            return (C == 0) ? 4096 : (C & (~C + 1));
        }

        MemCpyInst* MemMoveToMemCpy(MemMoveInst* MM);
        Instruction* insertReverseLoop(BasicBlock* Loc, BasicBlock* Post, Value* Length, StringRef BBName);
        Instruction* insertLoop(Instruction* Loc, Value* Length, StringRef BBName);
        Value* replicateScalar(Value* ScalarVal, Type* Ty, Instruction* InsertBefore);
        void generalGroupI8Stream(
            LLVMContext& C, uint32_t NumI8, uint32_t Align,
            uint32_t& NumI32, Type** Vecs, uint32_t& L, uint32_t BaseTypeSize);

        /// replace member function
        void replaceMemcpy(IntrinsicInst* I);
        void replaceMemset(IntrinsicInst* I);
        void replaceMemMove(IntrinsicInst* I);
        void replaceExpect(IntrinsicInst* I);
        void replaceFunnelShift(IntrinsicInst* I);

        // Maps each supported intrinsic ID to its lowering routine.
        static const std::map< Intrinsic::ID, MemFuncPtr_t > m_intrinsicToFunc;
    };
}
110 
// Register pass to igc-opt
#define PASS_FLAG "igc-replace-unsupported-intrinsics"
#define PASS_DESCRIPTION "Replace calls to instrinsics which are not supported by the codegen"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(ReplaceUnsupportedIntrinsics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(ReplaceUnsupportedIntrinsics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

char ReplaceUnsupportedIntrinsics::ID = 0;

// Dispatch table: each intrinsic the pass knows how to lower, paired with
// the member function that performs the lowering.
const std::map< Intrinsic::ID, ReplaceUnsupportedIntrinsics::MemFuncPtr_t > ReplaceUnsupportedIntrinsics::m_intrinsicToFunc =
{
    { Intrinsic::fshl,       &ReplaceUnsupportedIntrinsics::replaceFunnelShift },
    { Intrinsic::fshr,       &ReplaceUnsupportedIntrinsics::replaceFunnelShift },
    { Intrinsic::memcpy,     &ReplaceUnsupportedIntrinsics::replaceMemcpy },
    { Intrinsic::memset,     &ReplaceUnsupportedIntrinsics::replaceMemset },
    { Intrinsic::memmove,    &ReplaceUnsupportedIntrinsics::replaceMemMove },
    { Intrinsic::expect,     &ReplaceUnsupportedIntrinsics::replaceExpect }
};
131 
// Constructor: registers this pass in LLVM's global PassRegistry so it can
// be instantiated by name (e.g. via igc-opt's -igc-replace-unsupported-intrinsics).
ReplaceUnsupportedIntrinsics::ReplaceUnsupportedIntrinsics() : FunctionPass(ID)
{
    initializeReplaceUnsupportedIntrinsicsPass(*PassRegistry::getPassRegistry());
}
136 
MemMoveToMemCpy(MemMoveInst * MM)137 MemCpyInst* ReplaceUnsupportedIntrinsics::MemMoveToMemCpy(MemMoveInst* MM)
138 {
139     SmallVector<Value*, 5> args;
140     for (unsigned i = 0; i < MM->getNumArgOperands(); i++)
141         args.push_back(MM->getArgOperand(i));
142 
143     auto* Dst = MM->getRawDest();
144     auto* Src = MM->getRawSource();
145     auto* Size = MM->getLength();
146 
147     Type* Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
148     auto* M = MM->getParent()->getParent()->getParent();
149     auto TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
150 
151     return cast<MemCpyInst>(MemCpyInst::Create(TheFn, args));
152 }
153 
// insertReverseLoop - Insert an empty loop at the end of BB 'Loc'.
// The loop's induction variable iterates from 'Length'-1 to 0.
// The return value is the value of the induction variable in the loop's body.
//
// 'Loc' must currently end with an unconditional branch (it is erased and
// replaced by the loop's conditional branch). 'Post' is the block control
// reaches once the loop finishes. The IV lives in an alloca placed at the
// top of the function's entry block; callers insert their per-iteration
// code right after the returned load of the IV.
Instruction* ReplaceUnsupportedIntrinsics::insertReverseLoop(
    BasicBlock* Loc, BasicBlock* Post, Value* Length, StringRef BBName)
{
    DebugLoc DL = Loc->getTerminator()->getDebugLoc();
    Function* F = Loc->getParent();
    LLVMContext& C = F->getContext();
    IntegerType* LengthType = cast<IntegerType>(Length->getType());
    // Create an alloca for storing the loop's induction variable
    Value* pIV = new AllocaInst(LengthType, 0, "pIV", &(*F->getEntryBlock().begin()));
    // Split the BB at the location of the call
    BasicBlock* Pre = Loc;
    // Create a new BB for the loop Body
    BasicBlock* Body = BasicBlock::Create(C, Twine(BBName) + ".body", F, Post);
    ConstantInt* Zero = ConstantInt::get(LengthType, 0);
    ConstantInt* One = ConstantInt::get(LengthType, 1);
    {
        // Remove the unconditional 'br' instruction which will be replaced by a conditional 'br'
        Pre->getTerminator()->eraseFromParent();
        IGCLLVM::IRBuilder<> B(Pre);
        B.SetCurrentDebugLocation(DL);
        // Init the IV to Length-1; skip the loop entirely when Length == 0
        // (the signed compare Length-1 >= 0 is then false after wraparound).
        auto* Init = B.CreateSub(Length, One);
        B.CreateStore(Init, pIV);
        Value* IsContinue = B.CreateICmpSGE(Init, Zero);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    // The induction variable's value
    Instruction* IV;
    {
        // Loop body's Basic Block
        IGCLLVM::IRBuilder<> B(Body);
        B.SetCurrentDebugLocation(DL);
        IV = B.CreateLoad(pIV, "IV");
        // User of function will add more instructions at this point ...
        // Decrement the IV and check for end of loop
        Value* Dec = B.CreateSub(IV, One);
        B.CreateStore(Dec, pIV);
        Value* IsContinue = B.CreateICmpSGE(Dec, Zero);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    return IV;
}
199 
// insertLoop - Insert an empty loop before instruction 'Loc'.
// The loop's induction variable iterates from 0 to 'Length'-1.
// The return value is the value of the induction variable in the loop's body.
//
// The block containing 'Loc' is split; everything from 'Loc' onward moves to
// a new ".post" block executed after the loop. The IV lives in an alloca at
// the top of the entry block; callers insert their per-iteration code right
// after the returned load of the IV.
Instruction* ReplaceUnsupportedIntrinsics::insertLoop(Instruction* Loc, Value* Length, StringRef BBName)
{
    DebugLoc DL = Loc->getDebugLoc();
    Function* F = Loc->getParent()->getParent();
    LLVMContext& C = F->getContext();
    IntegerType* LengthType = cast<IntegerType>(Length->getType());
    // Create an alloca for storing the loop's induction variable
    Value* pIV = new AllocaInst(LengthType, 0, "pIV", &(*F->getEntryBlock().begin()));
    // Split the BB at the location of the call
    BasicBlock* Pre = Loc->getParent();
    BasicBlock* Post = Pre->splitBasicBlock(
        BasicBlock::iterator(Loc), Twine(BBName) + ".post");
    // Create a new BB for the loop Body
    BasicBlock* Body = BasicBlock::Create(C, Twine(BBName) + ".body", F, Post);
    {
        // Remove the unconditional 'br' instruction which will be replaced by a conditional 'br'
        // (splitBasicBlock left Pre terminated by an unconditional br to Post).
        Pre->getTerminator()->eraseFromParent();
        IGCLLVM::IRBuilder<> B(Pre);
        B.SetCurrentDebugLocation(DL);
        ConstantInt* Zero = ConstantInt::get(LengthType, 0);
        // Init the IV; the guard 0 < Length skips the loop when Length == 0.
        B.CreateStore(Zero, pIV);
        Value* IsContinue = B.CreateICmpULT(Zero, Length);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    // The induction variable's value
    Instruction* IV;
    {
        // Loop body's Basic Block
        IGCLLVM::IRBuilder<> B(Body);
        B.SetCurrentDebugLocation(DL);
        IV = B.CreateLoad(pIV, "IV");
        // User of function will add more instructions at this point ...
        // Increment the IV and check for end of loop
        Value* Inc = B.CreateAdd(IV, ConstantInt::get(LengthType, 1));
        B.CreateStore(Inc, pIV);
        Value* IsContinue = B.CreateICmpULT(Inc, Length);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    return IV;
}
244 
// replicateScalar - widen ScalarVal so its bit pattern fills the (scalar or
// vector-element) type of 'Ty', then, if 'Ty' is a vector, splat that widened
// value into every lane. Used by memset lowering to turn the i8 fill value
// into the wider store element. Requires the element width to be a multiple
// of the scalar width and at most 64 bits (see asserts below). New
// instructions, if any, are emitted before 'InsertBefore'.
Value* ReplaceUnsupportedIntrinsics::replicateScalar(
    Value* ScalarVal, Type* Ty, Instruction* InsertBefore)
{
    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
    Type* ETy = VTy ? VTy->getElementType() : Ty;
    uint32_t sBits = (unsigned int)ScalarVal->getType()->getPrimitiveSizeInBits();
    uint32_t nBits = (unsigned int)ETy->getPrimitiveSizeInBits();
    IGC_ASSERT(sBits);
    IGC_ASSERT_MESSAGE((nBits % sBits) == 0, "Type mismatch in replicateScalar!");
    IGC_ASSERT_MESSAGE(nBits <= 64, "Type mismatch in replicateScalar!");
    uint32_t ratio = nBits / sBits;

    IGCLLVM::IRBuilder<> Builder(InsertBefore);
    Value* NewVal;
    if (ratio > 1)
    {
        if (ConstantInt* CI = dyn_cast<ConstantInt>(ScalarVal))
        {
            // Constant path: build the replicated bit pattern at compile time
            // (e.g. i8 0xAB -> i32 0xABABABAB) without emitting instructions.
            uint64_t s = CI->getZExtValue();
            uint64_t n = s;
            for (unsigned i = 1; i < ratio; ++i)
            {
                n = (n << sBits) | s;
            }
            NewVal = ConstantInt::get(ETy, n);
        }
        else
        {
            // Runtime path: zero-extend, then shift-and-add the scalar into
            // each sBits-wide slot of the wider value.
            Value* nScalarVal = Builder.CreateZExt(ScalarVal, ETy);
            NewVal = nScalarVal;
            for (unsigned i = 1; i < ratio; ++i)
            {
                NewVal = Builder.CreateShl(NewVal, sBits);
                NewVal = Builder.CreateAdd(NewVal, nScalarVal);
            }
        }
    }
    else
    {
        // Scalar already matches the element width; use it as-is.
        NewVal = ScalarVal;
    }

    Value* Res;
    if (VTy)
    {
        // Splat the widened value into every lane of the vector.
        Res = UndefValue::get(VTy);
        Type* TyI32 = Type::getInt32Ty(ScalarVal->getContext());
        for (unsigned i = 0; i < VTy->getNumElements(); ++i)
        {
            Value* Idx = ConstantInt::get(TyI32, i);
            Res = Builder.CreateInsertElement(Res, NewVal, Idx);
        }
    }
    else
    {
        Res = NewVal;
    }
    return Res;

}
305 
// A help functions to generate vector load or stores for efficient
// memory operations.
// However, if size of base type is to kept the generated vectors will be different
// <8xi32>  for size of the base type = 32
// <16xi16> for size of the base type = 16
//
// generalGroupI8Stream() groups a stream of i8 into a stream of <8xi32> or <16xi16> as
// much as possible. Then for the remaining i8's ( < 32), group them
// into vectors of element type i32(i16) and/or i8. This results in at most
// the following 5 vectors and/or scalars:
//    <4xi32>, <3xi32> or <2xi32>, i32, <2xi8>, i8 or
//    <8xi16>, <4xi16>, <2xi16>, i16, i8
// Note that we will not generate <3xi8> (see also the code for details).
// For example, given 127 i8's, we can
// have:
//   <8xi32>, <8xi32>, <8xi32>, <4xi32>, <3xi32>, <2xi8>, i8
//
// The grouping result are kept in Vecs, L (actual length of Vecs),
// and NumI32 (the number of <8xi32>, ie. the number of Vecs[0]. For all
// the other vectors/scalars, ie Vecs[1 : L-1], the number is always 1).
// For the above case, they are:
//    Vecs[0] = <8xi32>
//    Vecs[1] = <4xi32>
//    Vecs[2] = <3xi32>
//    Vecs[3] = <2xi8>
//    Vecs[4] = i8
//    L = 5;
//    NumI32 = 3;
//
// We may generate <3xi32>, but not <3xi8> as <3xi32> can be loaded
// or stored by a single send instruction, where <3xi8> cannot (even
// <3xi8> can be splitted later in VectorProcessing, but it's better
// not generate <3xi8> vector in the first place).
//
// The same example with given 127 i8's but with keeping size of base
// type of initial vector as 16 we can have:
//   <16xi16>, <16xi16>, <16xi16>, <8xi16>, <4xi16>, <2xi16>, i16, i8
//
// The grouping result are kept in Vecs, L (actual length of Vecs),
// and NumI32 (the number of <16xi16>, ie. the number of Vecs[0]. For all
// the other vectors/scalars, ie Vecs[1 : L-1], the number is always 1).
// For the above case, they are:
//    Vecs[0] = <16xi16>
//    Vecs[1] = <8xi16>
//    Vecs[2] = <4xi16>
//    Vecs[3] = <2xi16>
//    Vecs[4] = i16
//    Vecs[5] = i8
//    L = 6;
//    NumI32 = 3;
//
// Note that Vecs[] should be allocated by callers with enough space
// to hold all vectors (6 should be enough; 1 for <8xi32>(<16xi16>),
// 5 for the others).
// We want from <4x<2xhalf>> [with size of the base type 16(half)]
// generate <8xi16> not <4xi32>
// Default BaseTypeSize=32 means that we don't concern about keeping
// size of the base type
void ReplaceUnsupportedIntrinsics::generalGroupI8Stream(
    LLVMContext& C, uint32_t NumI8, uint32_t Align,
    uint32_t& VectorsNum, Type** Vecs, uint32_t& L, uint32_t BaseTypeSize = 32)
{
    VectorsNum = NumI8 / 32; // size of <8xi32> = 32. count of <8xi32> or <16xi16>
    uint32_t RemI8 = NumI8 % 32;
    uint32_t BaseTypeSizeInBytes = BaseTypeSize / 8;
    uint32_t CntI = RemI8 / BaseTypeSizeInBytes;    // the number of i32(0..7) or i16(0..15)
    uint32_t CntI8 = RemI8 % BaseTypeSizeInBytes;   // remaining number of i8(0-3) - for base_type_size = 32 or
                                                    //                     i8(0-1) - for base_type_size = 16

    // To process all cases (3 for i32 and 4 for i16: it depends of how much CntI do we have)
    uint32_t Power = 256 / BaseTypeSize;    // i32: (256 / 32) = 0b1000 = 8
                                            // i16: (256 / 16) = 0b10000 = 16

    Type* BaseType = Type::getIntNTy(C, BaseTypeSize);
    Type* TyI8 = Type::getInt8Ty(C);

    uint32_t n = 0;
    // Vecs[0] is always the full 32-byte vector (<8xi32> or <16xi16>);
    // VectorsNum says how many of it the caller should emit.
    Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, Power);

    // Greedily cover the remaining base-type elements with power-of-two
    // vectors of decreasing width (each width used at most once).
    while ((Power >>= 1) > 1)
    {
        if (CntI >= Power)
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, Power);
            CntI -= Power;
        }
        if (CntI == 3 && BaseTypeSize == 32 && Align >= 4) // special case for <8xi32> not to generate <3xi8> but to generate <3xi32>
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, 3);
            CntI = 0;
            break;
        }
    }
    if (CntI >= 1)
    {
        Vecs[n++] = BaseType;
        CntI -= Power; // Assume that pow should be 1 to generate i32(i16) and not <1xi32>(<1xi16>)
    }
    IGC_ASSERT_MESSAGE(CntI == 0, "Did not handle all types of base_type");

    // Same greedy scheme for the i8 tail that doesn't fill a base-type element.
    Power = BaseTypeSize / 4;   // i32: 32 / 8 = 4
                                // i16: 16 / 8 = 2
    while ((Power >>= 1) > 1)
    {
        if (CntI8 >= Power)
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(TyI8, Power);
            CntI8 -= Power;
        }
    }
    if (CntI8 >= 1)
    {
        Vecs[n++] = TyI8;
        CntI8 -= Power; // Assume that pow should be 1 to generate i8 not <1xi8>
    }
    IGC_ASSERT_MESSAGE(CntI8 == 0, "Did not handle all types of I8");

    L = n;
}
425 
void ReplaceUnsupportedIntrinsics::replaceMemcpy(IntrinsicInst* I)
{
    // The idea is to convert
    //
    //   memcpy (i8* Dst, i8* Src, len)
    //
    // into a vector load and store for cases where "len" is
    // constant. If "len" isn't constant,  just use i8 copy as
    // this should not happen with OCL code (all memcpy is
    // generated by the compiler for cases such as structure
    // assignment, etc.)
    //
    // If len is constant, it will be transferred to
    //
    //   lenv8 = len / 32 (<8xi32>);
    //   len_rem = len % 32;
    //
    //   // main loop
    //   dstV8 = bitcast Dst, <8xi32>*
    //   srcV8 = bitcast Src, <8xi32>*
    //   for(i=0; i < lenv8; ++i)
    //     dstV8[i] = srcV8[i];
    //
    //   // epilog, process remaining elements
    //   for(i=0; i < len_rem; ++i)
    //     Dst[lenv8*32 + i] = Src[lenv8*32 + i];
    //
    //   Note that the above epilog loop is optimized away with
    //   as much as possible <nxi32> and <mxi8> loads and stores
    //   or if we want to keep size of the base type
    //   (for 16bit there will be <nxi16> and <mxi8>)
    //
    // Selecting 8 as vector length or 16 in case of i16 is due to
    // that A64 messages can load eight i32 or sixteen i16 per SIMD channel.
    // A32 will have 2 loads/stores for each vector, which is still efficient.
    // Unaligned vector will be handled correctly and effciently later
    // in vector load and store emit.
    MemCpyInst* MC = cast<MemCpyInst>(I);
    Value* Dst = MC->getRawDest();
    Value* Src = MC->getRawSource();
    Value* LPCount = MC->getLength();
    uint32_t Align = MC->getDestAlignment();
    Align = Align != 0 ? Align : 1;  // alignment 0 means "unknown": treat as 1 byte
    const bool IsVolatile = MC->isVolatile();
    const uint32_t SrcAS = MC->getSourceAddressSpace();
    const uint32_t DstAS = MC->getDestAddressSpace();

    LLVMContext& C = MC->getContext();
    Type* TySrcPtrI8 = Type::getInt8PtrTy(C, SrcAS);
    Type* TyDstPtrI8 = Type::getInt8PtrTy(C, DstAS);

    IGCLLVM::IRBuilder<> Builder(MC);

    // BaseSize is flag if we want to handle algorithm in general way
    // or want to keep size of base type to further optimizations
    uint32_t BaseSize = 0;
    Type* RawDstType = Dst->stripPointerCasts()->getType()->getPointerElementType();
    if (Type* BaseType = GetBaseType(RawDstType))
        BaseSize = BaseType->getScalarSizeInBits();

    if (BaseSize != 16)
        // size 32 is equal to size of i32, so general algorithm will be applied
        BaseSize = 32;

    ConstantInt* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        uint32_t Count = (uint32_t)CI->getZExtValue();

        // Partition Count bytes into at most 8 vector/scalar types
        // (see generalGroupI8Stream's comment for the scheme).
        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len, BaseSize);

        Value* NewSrc, * NewDst, * vDst, * vSrc;
        uint32_t BOfst = 0; // Byte offset

        // First, insert main loop before MC.
        // Note that if NewCount is small, we may directly generate ld/st
        // without generating the loop.
        if (NewCount > 0)
        {
            vSrc = Builder.CreateBitCast(SkipBitCast(Src), PointerType::get(VecTys[0], SrcAS), "memcpy_vsrc");
            vDst = Builder.CreateBitCast(SkipBitCast(Dst), PointerType::get(VecTys[0], DstAS), "memcpy_vdst");

            // getPrimitiveSizeInBits() should be enough, no need to
            // use DataLayout to get target-dependent size.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);

            // To set alignment correctly
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;

            // If NewCount is less than 6,  don't generate loop.
            // Note that 6 is just an arbitrary number here.
            if (NewCount < 6)
            {
                // Fully unrolled: one load/store pair per <8xi32>/<16xi16> chunk.
                for (unsigned i = 0; i < NewCount; ++i)
                {
                    Value* tSrc = Builder.CreateConstGEP1_32(vSrc, i);
                    Value* tDst = Builder.CreateConstGEP1_32(vDst, i);
                    LoadInst* L = Builder.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
                    (void)Builder.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
                }
            }
            else
            {
                // Loop over the chunks; the copy is inserted right after the IV load.
                Value* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertLoop(MC, NewLPCount, "memcpy");
                {
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tSrc = B.CreateGEP(vSrc, IV);
                    Value* tDst = B.CreateGEP(vDst, IV);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
                }
            }

            BOfst = NewCount * SZ;
        }

        // Second, generate epilog code before MC.
        // Note that as MC has been moved to a different BB by
        //   inserting the main loop! Reset it to MC.
        Builder.SetInsertPoint(MC);
        if (Len > 1)
        {
            Src = Builder.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
            Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");
        }
        // Emit one load/store pair per remaining group (VecTys[1..Len-1]),
        // advancing the byte offset as we go.
        for (unsigned i = 1; i < Len; ++i)
        {
            uint32_t SZ = (unsigned int)VecTys[i]->getPrimitiveSizeInBits() / 8;
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;
            NewSrc = BOfst > 0 ? Builder.CreateConstGEP1_32(Src, BOfst) : Src;
            NewDst = BOfst > 0 ? Builder.CreateConstGEP1_32(Dst, BOfst) : Dst;
            vSrc = Builder.CreateBitCast(SkipBitCast(NewSrc), PointerType::get(VecTys[i], SrcAS), "memcpy_rem");
            vDst = Builder.CreateBitCast(SkipBitCast(NewDst), PointerType::get(VecTys[i], DstAS), "memcpy_rem");
            LoadInst* L = Builder.CreateAlignedLoad(vSrc, getAlign(Align), IsVolatile);
            (void)Builder.CreateAlignedStore(L, vDst, getAlign(Align), IsVolatile);
            BOfst += SZ;
        }
    }
    else
    {
        Src = Builder.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
        Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");
        // Fall back to i8 copy
        Instruction* IV = insertLoop(MC, LPCount, "memcpy");
        {
            IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
            Value* tSrc = B.CreateGEP(Src, IV);
            Value* tDst = B.CreateGEP(Dst, IV);
            LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
            (void)B.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
        }
    }
    // The original intrinsic has been fully replaced by the emitted IR.
    MC->eraseFromParent();
}
585 
// replaceMemMove - lower llvm.memmove. If the address spaces prove the
// operands cannot alias, lower as a plain memcpy. Otherwise emit a runtime
// overlap check:
//   if (Src < Dst)  copy backwards (reverse loop / reverse unrolled stores)
//   else            copy forwards via replaceMemcpy
void ReplaceUnsupportedIntrinsics::replaceMemMove(IntrinsicInst* I)
{
    MemMoveInst* MM = cast<MemMoveInst>(I);
    Value* Dst = MM->getRawDest();
    Value* Src = MM->getRawSource();
    Value* LPCount = MM->getLength();
    uint32_t Align = MM->getDestAlignment();
    if (Align == 0)
        Align = 1;  // alignment 0 means "unknown": treat as 1 byte
    const bool IsVolatile = MM->isVolatile();
    const uint32_t SrcAS = MM->getSourceAddressSpace();
    const uint32_t DstAS = MM->getDestAddressSpace();

    // If non-generic address spaces mismatch, they can't alias
    // and we can do a memcpy.

    if (SrcAS < ADDRESS_SPACE_NUM_ADDRESSES &&
        DstAS < ADDRESS_SPACE_NUM_ADDRESSES &&
        SrcAS != ADDRESS_SPACE_GENERIC &&
        DstAS != ADDRESS_SPACE_GENERIC &&
        SrcAS != DstAS)
    {
        auto* MemCpy = MemMoveToMemCpy(MM);
        MemCpy->insertBefore(MM);
        replaceMemcpy(MemCpy);
        MM->eraseFromParent();
        return;
    }

    LLVMContext& C = MM->getContext();
    Type* TySrcPtrI8 = Type::getInt8PtrTy(C, SrcAS);
    Type* TyDstPtrI8 = Type::getInt8PtrTy(C, DstAS);

    auto* F = MM->getParent()->getParent();

    IGCLLVM::IRBuilder<> B(MM);

    auto* i8Src = B.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
    auto* i8Dst = B.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");

    // Setup control flow to do:
    // if (Src < Dst)
    //   reverse copy data
    // else
    //   normal copy (such as memcpy())

    // Src < Dst
    Value* pCmp = nullptr;
    {
        // Pointers in different address spaces cannot be compared directly;
        // cast through the generic address space when one side is generic.
        auto* cmpCastSrc = (DstAS == ADDRESS_SPACE_GENERIC) ?
            B.CreateAddrSpaceCast(i8Src, TyDstPtrI8) : i8Src;
        auto* cmpCastDst = (SrcAS == ADDRESS_SPACE_GENERIC) ?
            B.CreateAddrSpaceCast(i8Dst, TySrcPtrI8) : i8Dst;

        pCmp = B.CreateICmpULT(cmpCastSrc, cmpCastDst);
    }

    // Split the current block at MM; MM and everything after it move to Post.
    auto* Pre = MM->getParent();
    auto* Post = Pre->splitBasicBlock(MM, "memmove.post");

    Pre->getTerminator()->eraseFromParent();

    auto* BBTrue = BasicBlock::Create(C, "memmove.true", F, Post);
    auto* BBFalse = BasicBlock::Create(C, "memmove.false", F, Post);

    B.SetInsertPoint(Pre);
    B.CreateCondBr(pCmp, BBTrue, BBFalse);

    B.SetInsertPoint(BBTrue);
    B.CreateBr(Post);
    B.SetInsertPoint(BBFalse);
    B.CreateBr(Post);

    auto* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        uint32_t Count = (uint32_t)CI->getZExtValue();

        // noop
        if (Count == 0)
        {
            MM->eraseFromParent();
            return;
        }

        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len);

        // for true block (Src < Dst), do a reverse copy.
        {
            B.SetInsertPoint(BBTrue->getTerminator());

            // calculate byte offsets so we can walk backwards through them
            SmallVector<uint, 8> byteOffsets{ 0 };

            {
                uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);
                uint32_t BOfst = NewCount * SZ;

                for (unsigned i = 1; i < Len; i++)
                {
                    byteOffsets.push_back(BOfst);
                    uint32_t SZ = (unsigned int)(VecTys[i]->getPrimitiveSizeInBits() / 8);
                    BOfst += SZ;
                }
            }

            // emit the smaller than <8 x i32> stores
            // (the epilog groups, copied highest-offset first)
            for (unsigned i = Len - 1; i >= 1; i--)
            {
                uint offset = byteOffsets[i];
                uint32_t newAlign = getLargestPowerOfTwo(Align + offset);
                auto* tSrc = B.CreateConstGEP1_32(i8Src, offset);
                auto* tDst = B.CreateConstGEP1_32(i8Dst, offset);

                auto* vSrc = B.CreateBitCast(SkipBitCast(tSrc), PointerType::get(VecTys[i], SrcAS), "memcpy_rem");
                auto* vDst = B.CreateBitCast(SkipBitCast(tDst), PointerType::get(VecTys[i], DstAS), "memcpy_rem");
                LoadInst* L = B.CreateAlignedLoad(vSrc, getAlign(newAlign), IsVolatile);
                (void)B.CreateAlignedStore(L, vDst, getAlign(newAlign), IsVolatile);
            }

            // now emit the <8 x i32> stores
            auto* vSrc = B.CreateBitCast(SkipBitCast(Src), PointerType::get(VecTys[0], SrcAS), "memcpy_vsrc");
            auto* vDst = B.CreateBitCast(SkipBitCast(Dst), PointerType::get(VecTys[0], DstAS), "memcpy_vdst");
            // If NewCount is less than 6,  don't generate loop.
            // Note that 6 is just an arbitrary number here.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);
            uint32_t newAlign = getLargestPowerOfTwo(Align + SZ);
            if (NewCount < 6)
            {
                // Fully unrolled, highest chunk index first (reverse copy).
                for (unsigned i = 0; i < NewCount; i++)
                {
                    unsigned idx = NewCount - 1 - i;
                    auto* tSrc = B.CreateConstGEP1_32(vSrc, idx);
                    auto* tDst = B.CreateConstGEP1_32(vDst, idx);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(newAlign), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(newAlign), IsVolatile);
                }
            }
            else
            {
                // Reverse loop from NewCount-1 down to 0.
                auto* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertReverseLoop(BBTrue, Post, NewLPCount, "memmmove");
                {
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tSrc = B.CreateGEP(vSrc, IV);
                    Value* tDst = B.CreateGEP(vDst, IV);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(newAlign), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(newAlign), IsVolatile);
                }
            }
        }

        // for false block (Src >= Dst), just a plain memcpy.
        {
            auto* MemCpy = MemMoveToMemCpy(MM);
            MemCpy->insertBefore(BBFalse->getTerminator());
            replaceMemcpy(MemCpy);
        }
    }
    else
    {
        // (Src < Dst)
        {
            B.SetInsertPoint(BBTrue->getTerminator());
            // Fall back to i8 copy
            Instruction* IV = insertReverseLoop(BBTrue, Post, LPCount, "memmove");
            {
                IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                Value* tSrc = B.CreateGEP(i8Src, IV);
                Value* tDst = B.CreateGEP(i8Dst, IV);
                LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(1), IsVolatile);
                (void)B.CreateAlignedStore(L, tDst, getAlign(1), IsVolatile);
            }
        }

        // for false block (Src >= Dst), just a plain memcpy.
        {
            auto* MemCpy = MemMoveToMemCpy(MM);
            MemCpy->insertBefore(BBFalse->getTerminator());
            replaceMemcpy(MemCpy);
        }
    }

    MM->eraseFromParent();
}
773 
/// Lowers llvm.memset into explicit stores: a main loop (or unrolled run) of
/// wide vector stores covering most of the byte count, followed by an epilog
/// of progressively narrower stores for the remainder. Mirrors the structure
/// of replaceMemcpy. The fill value is replicated into vectors via
/// replicateScalar. For a non-constant length, falls back to a per-byte loop.
void ReplaceUnsupportedIntrinsics::replaceMemset(IntrinsicInst* I)
{
    // Same idea as replaceMemcpy (see comment of replaceMemcpy).
    MemSetInst* MS = cast<MemSetInst>(I);
    Value* Dst = MS->getRawDest();
    Value* Src = MS->getValue();       // i8 fill value
    Value* LPCount = MS->getLength();  // total byte count (may be non-constant)
    uint32_t Align = MS->getDestAlignment();
    const bool IsVolatile = MS->isVolatile();
    const uint32_t AS = MS->getDestAddressSpace();

    LLVMContext& C = MS->getContext();
    Type* TyPtrI8 = Type::getInt8PtrTy(C, AS);

    IGCLLVM::IRBuilder<> Builder(MS);

    ConstantInt* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        // Constant length: decompose Count bytes into a stream of vector
        // types. VecTys[0] is the widest (used NewCount times by the main
        // loop); VecTys[1..Len-1] cover the remainder, one store each.
        uint32_t Count = (uint32_t)CI->getZExtValue();

        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len);

        Value* NewDst, * vDst, * vSrc;
        uint32_t BOfst = 0; // Byte offset

        // First, insert main loop before MS.
        if (NewCount > 0)
        {
            PointerType* PTy = PointerType::get(VecTys[0], AS);
            // Broadcast the i8 fill value into a vector of type VecTys[0].
            vSrc = replicateScalar(Src, VecTys[0], MS);
            vDst = Builder.CreateBitCast(SkipBitCast(Dst), PTy, "memset_vdst");

            // getPrimitiveSizeInBits() should be enough, no need to
            // use DataLayout to get target-dependent size.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);

            // To set alignment correctly: never claim more than the
            // element size allows, never more than the original alignment.
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;

            // If NewCount is less than 6,  don't generate loop;
            // emit NewCount unrolled stores instead.
            // Note that 6 is just an arbitrary number here.
            if (NewCount < 6)
            {
                for (unsigned i = 0; i < NewCount; ++i)
                {
                    Value* tDst = Builder.CreateConstGEP1_32(vDst, i);
                    (void)Builder.CreateAlignedStore(vSrc, tDst, getAlign(Align), IsVolatile);
                }
            }
            else
            {
                Value* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertLoop(MS, NewLPCount, "memset");
                {
                    // Emit the loop body right after the induction variable IV.
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tDst = B.CreateGEP(vDst, IV);
                    (void)B.CreateAlignedStore(vSrc, tDst, getAlign(Align), IsVolatile);
                }
            }

            // Set offset for the remaining elements
            BOfst = NewCount * SZ;
        }

        // Second, generate epilog code before MS.
        // Note that MS has been moved to a different BB by
        //   inserting the main loop! Reset the insert point to MS.
        Builder.SetInsertPoint(MS);
        if (Len > 1)
        {
            Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyPtrI8, "memset_dst");
        }
        // One store per remaining (narrower) vector type, advancing the
        // byte offset BOfst as we go.
        for (unsigned i = 1; i < Len; ++i)
        {
            uint32_t SZ = (unsigned int)VecTys[i]->getPrimitiveSizeInBits() / 8;
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;
            PointerType* PTy = PointerType::get(VecTys[i], AS);
            NewDst = BOfst > 0 ? Builder.CreateConstGEP1_32(Dst, BOfst) : Dst;
            vSrc = replicateScalar(Src, VecTys[i], MS);
            vDst = Builder.CreateBitCast(SkipBitCast(NewDst), PTy, "memset_rem");
            (void)Builder.CreateAlignedStore(vSrc, vDst, getAlign(Align), IsVolatile);
            BOfst += SZ;
        }
    }
    else
    {
        // Non-constant length: cannot pre-group into vectors.
        Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyPtrI8, "memset_dst");
        // Fall back to i8 copy
        Instruction* IV = insertLoop(MS, LPCount, "memset");
        {
            IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
            Value* tDst = B.CreateGEP(Dst, IV);
            (void)B.CreateAlignedStore(Src, tDst, getAlign(Align), IsVolatile);
        }
    }
    MS->eraseFromParent();
}
876 
replaceExpect(IntrinsicInst * MS)877 void ReplaceUnsupportedIntrinsics::replaceExpect(IntrinsicInst* MS)
878 {
879     MS->replaceAllUsesWith(MS->getOperand(0));
880     MS->eraseFromParent();
881 
882 }
883 
884 /*
885   Replaces llvm.fshl.* and llvm.fshr.* funnel shift intrinsics.
886   E.g. for fshl we would produce a following sequence:
887   %r = call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) =>
888   %modRes = urem i8 %c, 8        // get the modulo of shift value
889   %subRes = sub i8 8, %modRes    // subtract from the type's number of bits
890   %shlRes = shl i8 %a, %modRes   // shift the bits according to instruction spec
891   %shrRes = lshr i8 %b, %subRes
892   %r = or i8 %shlRes, %shrRes    // compose the final result
893 */
replaceFunnelShift(IntrinsicInst * I)894 void ReplaceUnsupportedIntrinsics::replaceFunnelShift(IntrinsicInst* I) {
895     IGC_ASSERT(I->getIntrinsicID() == Intrinsic::fshl ||
896         I->getIntrinsicID() == Intrinsic::fshr);
897     IGCLLVM::IRBuilder<> Builder(I);
898     unsigned sizeInBits = I->getArgOperand(0)->getType()->getScalarSizeInBits();
899 
900     // Don't replace rotate
901     if (I->getArgOperand(0) == I->getArgOperand(1) && !I->getType()->isVectorTy() &&
902         m_Ctx->platform.supportRotateInstruction())
903     {
904         if (sizeInBits == 16 || sizeInBits == 32) {
905             return;
906         }
907     }
908 
909     Value* numBits = Builder.getIntN(sizeInBits, sizeInBits);
910     if (auto IVT = dyn_cast<IGCLLVM::FixedVectorType>(I->getType())) {
911         numBits = ConstantVector::getSplat(IGCLLVM::getElementCount((uint32_t)IVT->getNumElements()), cast<Constant>(numBits));
912     }
913     auto shiftModulo = Builder.CreateURem(I->getArgOperand(2), numBits);
914     auto negativeShift = Builder.CreateSub(numBits, shiftModulo);
915     if (I->getIntrinsicID() == Intrinsic::fshr) {
916         std::swap(shiftModulo, negativeShift);
917     }
918     auto upperShifted = Builder.CreateShl(I->getArgOperand(0), shiftModulo);
919     auto lowerShifted = Builder.CreateLShr(I->getArgOperand(1), negativeShift);
920     auto result = Builder.CreateOr(upperShifted, lowerShifted);
921 
922     I->replaceAllUsesWith(result);
923     I->eraseFromParent();
924 }
925 
visitIntrinsicInst(IntrinsicInst & I)926 void ReplaceUnsupportedIntrinsics::visitIntrinsicInst(IntrinsicInst& I) {
927     if (m_intrinsicToFunc.find(I.getIntrinsicID()) != m_intrinsicToFunc.end()) {
928         m_instsToReplace.push_back(&I);
929     }
930 }
931 
runOnFunction(Function & F)932 bool ReplaceUnsupportedIntrinsics::runOnFunction(Function& F)
933 {
934     m_Ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
935     m_instsToReplace.clear();
936     visit(F);
937     for (auto I : m_instsToReplace) {
938         (this->*m_intrinsicToFunc.at(I->getIntrinsicID())) (I);
939     }
940     return !m_instsToReplace.empty();
941 }
942 
createReplaceUnsupportedIntrinsicsPass()943 FunctionPass* IGC::createReplaceUnsupportedIntrinsicsPass()
944 {
945     return new ReplaceUnsupportedIntrinsics();
946 }
947