/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.hpp"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "common/igc_regkeys.hpp"

#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include "llvmWrapper/IR/Instructions.h"
#include "llvmWrapper/IR/IRBuilder.h"
#include "llvmWrapper/Support/Alignment.h"
#include "llvmWrapper/Support/TypeSize.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include <llvm/IR/InstVisitor.h>
#include "common/LLVMWarningsPop.hpp"

#include <map>
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using IGCLLVM::getAlign;

namespace
{
    /// The ReplaceUnsupportedIntrinsics pass lowers calls to intrinsics that the
    // codegen does not support: llvm.memcpy, llvm.memset and llvm.memmove (all of
    // which appear in the SPIR spec), as well as llvm.expect and the llvm.fshl/llvm.fshr
    // funnel shifts.
    class ReplaceUnsupportedIntrinsics : public llvm::FunctionPass, public llvm::InstVisitor<ReplaceUnsupportedIntrinsics>
    {
    public:
        typedef void (ReplaceUnsupportedIntrinsics::* MemFuncPtr_t)(IntrinsicInst*);
        static char ID;

        ReplaceUnsupportedIntrinsics();

        ~ReplaceUnsupportedIntrinsics() {}

        virtual llvm::StringRef getPassName() const override
        {
            return "ReplaceUnsupportedIntrinsics";
        }

        virtual bool runOnFunction(llvm::Function& F) override;

        virtual void getAnalysisUsage(llvm::AnalysisUsage& AU) const override
        {
            AU.addRequired<CodeGenContextWrapper>();
        }

        void visitIntrinsicInst(llvm::IntrinsicInst& I);

    private:
        CodeGenContext* m_Ctx;
        std::vector<llvm::IntrinsicInst*> m_instsToReplace;

        /// Helpers
        ///
        // If the value comes from a bitcast, return its source; otherwise return the value itself.
        Value* SkipBitCast(Value* v) {
            if (BitCastInst* bc = dyn_cast<BitCastInst>(v)) {
                // Don't skip if this is a pointer cast and the addrspace changed
                if (v->getType()->isPointerTy() &&
                    bc->getOperand(0)->getType()->isPointerTy() &&
                    v->getType()->getPointerAddressSpace() != bc->getOperand(0)->getType()->getPointerAddressSpace()) {
                    return v;
                }
                v = bc->getOperand(0);
            }
            return v;
        }
        // Get the largest power-of-2 value that is <= C and that divides C.
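        // For example, getLargestPowerOfTwo(24) == 8: 24 is 0b11000, and
        // (C & (~C + 1)) == (C & -C) isolates the lowest set bit, which is
        // exactly the largest power of two that divides C.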
        uint32_t getLargestPowerOfTwo(uint32_t C) {
            // If C == 0 (shouldn't happen), return a big one.
            return (C == 0) ? 4096 : (C & (~C + 1));
        }

        MemCpyInst* MemMoveToMemCpy(MemMoveInst* MM);
        Instruction* insertReverseLoop(BasicBlock* Loc, BasicBlock* Post, Value* Length, StringRef BBName);
        Instruction* insertLoop(Instruction* Loc, Value* Length, StringRef BBName);
        Value* replicateScalar(Value* ScalarVal, Type* Ty, Instruction* InsertBefore);
        void generalGroupI8Stream(
            LLVMContext& C, uint32_t NumI8, uint32_t Align,
            uint32_t& NumI32, Type** Vecs, uint32_t& L, uint32_t BaseTypeSize);

        /// replace member functions
        void replaceMemcpy(IntrinsicInst* I);
        void replaceMemset(IntrinsicInst* I);
        void replaceMemMove(IntrinsicInst* I);
        void replaceExpect(IntrinsicInst* I);
        void replaceFunnelShift(IntrinsicInst* I);

        static const std::map< Intrinsic::ID, MemFuncPtr_t > m_intrinsicToFunc;
    };
}

// Register pass to igc-opt
#define PASS_FLAG "igc-replace-unsupported-intrinsics"
#define PASS_DESCRIPTION "Replace calls to intrinsics which are not supported by the codegen"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(ReplaceUnsupportedIntrinsics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(ReplaceUnsupportedIntrinsics, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

char ReplaceUnsupportedIntrinsics::ID = 0;

const std::map< Intrinsic::ID, ReplaceUnsupportedIntrinsics::MemFuncPtr_t > ReplaceUnsupportedIntrinsics::m_intrinsicToFunc =
{
    { Intrinsic::fshl, &ReplaceUnsupportedIntrinsics::replaceFunnelShift },
    { Intrinsic::fshr, &ReplaceUnsupportedIntrinsics::replaceFunnelShift },
    { Intrinsic::memcpy, &ReplaceUnsupportedIntrinsics::replaceMemcpy },
    { Intrinsic::memset, &ReplaceUnsupportedIntrinsics::replaceMemset },
    { Intrinsic::memmove, &ReplaceUnsupportedIntrinsics::replaceMemMove },
    { Intrinsic::expect, &ReplaceUnsupportedIntrinsics::replaceExpect }
};

ReplaceUnsupportedIntrinsics::ReplaceUnsupportedIntrinsics() : FunctionPass(ID)
{
    initializeReplaceUnsupportedIntrinsicsPass(*PassRegistry::getPassRegistry());
}

MemCpyInst* ReplaceUnsupportedIntrinsics::MemMoveToMemCpy(MemMoveInst* MM)
{
    SmallVector<Value*, 5> args;
    for (unsigned i = 0; i < MM->getNumArgOperands(); i++)
        args.push_back(MM->getArgOperand(i));

    auto* Dst = MM->getRawDest();
    auto* Src = MM->getRawSource();
    auto* Size = MM->getLength();

    Type* Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
    auto* M = MM->getParent()->getParent()->getParent();
    auto TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);

    return cast<MemCpyInst>(MemCpyInst::Create(TheFn, args));
}

// insertReverseLoop - Insert an empty loop at the end of BB 'Loc'.
// The loop's induction variable iterates from 'Length'-1 to 0.
// The return value is the value of the induction variable in the loop's body.
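//
// Illustrative sketch of the emitted control flow (value names are
// approximate, not the exact IR):
//   Pre:  Init = Length - 1; store Init to pIV; br (Init >= 0) ? Body : Post
//   Body: IV = load pIV   // the returned instruction; callers insert code after it
//         Dec = IV - 1; store Dec to pIV; br (Dec >= 0) ? Body : Post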
Instruction* ReplaceUnsupportedIntrinsics::insertReverseLoop(
    BasicBlock* Loc, BasicBlock* Post, Value* Length, StringRef BBName)
{
    DebugLoc DL = Loc->getTerminator()->getDebugLoc();
    Function* F = Loc->getParent();
    LLVMContext& C = F->getContext();
    IntegerType* LengthType = cast<IntegerType>(Length->getType());
    // Create an alloca for storing the loop's induction variable
    Value* pIV = new AllocaInst(LengthType, 0, "pIV", &(*F->getEntryBlock().begin()));
    // Split the BB at the location of the call
    BasicBlock* Pre = Loc;
    // Create a new BB for the loop Body
    BasicBlock* Body = BasicBlock::Create(C, Twine(BBName) + ".body", F, Post);
    ConstantInt* Zero = ConstantInt::get(LengthType, 0);
    ConstantInt* One = ConstantInt::get(LengthType, 1);
    {
        // Remove the unconditional 'br' instruction which will be replaced by a conditional 'br'
        Pre->getTerminator()->eraseFromParent();
        IGCLLVM::IRBuilder<> B(Pre);
        B.SetCurrentDebugLocation(DL);
        // Init the IV
        auto* Init = B.CreateSub(Length, One);
        B.CreateStore(Init, pIV);
        Value* IsContinue = B.CreateICmpSGE(Init, Zero);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    // The induction variable's value
    Instruction* IV;
    {
        // Loop body's Basic Block
        IGCLLVM::IRBuilder<> B(Body);
        B.SetCurrentDebugLocation(DL);
        IV = B.CreateLoad(pIV, "IV");
        // User of function will add more instructions at this point ...
        // Decrement the IV and check for end of loop
        Value* Dec = B.CreateSub(IV, One);
        B.CreateStore(Dec, pIV);
        Value* IsContinue = B.CreateICmpSGE(Dec, Zero);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    return IV;
}

// insertLoop - Insert an empty loop before instruction 'Loc'.
// The loop's induction variable iterates from 0 to 'Length'-1.
// The return value is the value of the induction variable in the loop's body.
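// The emitted control flow mirrors insertReverseLoop's, except that the IV
// counts up from 0 and the exit test is an unsigned compare against 'Length',
// so a zero 'Length' skips the body entirely.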
Instruction* ReplaceUnsupportedIntrinsics::insertLoop(Instruction* Loc, Value* Length, StringRef BBName)
{
    DebugLoc DL = Loc->getDebugLoc();
    Function* F = Loc->getParent()->getParent();
    LLVMContext& C = F->getContext();
    IntegerType* LengthType = cast<IntegerType>(Length->getType());
    // Create an alloca for storing the loop's induction variable
    Value* pIV = new AllocaInst(LengthType, 0, "pIV", &(*F->getEntryBlock().begin()));
    // Split the BB at the location of the call
    BasicBlock* Pre = Loc->getParent();
    BasicBlock* Post = Pre->splitBasicBlock(
        BasicBlock::iterator(Loc), Twine(BBName) + ".post");
    // Create a new BB for the loop Body
    BasicBlock* Body = BasicBlock::Create(C, Twine(BBName) + ".body", F, Post);
    {
        // Remove the unconditional 'br' instruction which will be replaced by a conditional 'br'
        Pre->getTerminator()->eraseFromParent();
        IGCLLVM::IRBuilder<> B(Pre);
        B.SetCurrentDebugLocation(DL);
        ConstantInt* Zero = ConstantInt::get(LengthType, 0);
        // Init the IV
        B.CreateStore(Zero, pIV);
        Value* IsContinue = B.CreateICmpULT(Zero, Length);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    // The induction variable's value
    Instruction* IV;
    {
        // Loop body's Basic Block
        IGCLLVM::IRBuilder<> B(Body);
        B.SetCurrentDebugLocation(DL);
        IV = B.CreateLoad(pIV, "IV");
        // User of function will add more instructions at this point ...
        // Increment the IV and check for end of loop
        Value* Inc = B.CreateAdd(IV, ConstantInt::get(LengthType, 1));
        B.CreateStore(Inc, pIV);
        Value* IsContinue = B.CreateICmpULT(Inc, Length);
        B.CreateCondBr(IsContinue, Body, Post);
    }
    return IV;
}

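// replicateScalar packs copies of 'ScalarVal' into the element type of 'Ty'
// and, if 'Ty' is a vector type, splats the packed value across all of its
// elements. For example, replicating an i8 value of 0xAB into <4 x i32>
// yields a vector whose every element is 0xABABABAB.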
Value* ReplaceUnsupportedIntrinsics::replicateScalar(
    Value* ScalarVal, Type* Ty, Instruction* InsertBefore)
{
    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
    Type* ETy = VTy ? VTy->getElementType() : Ty;
    uint32_t sBits = (unsigned int)ScalarVal->getType()->getPrimitiveSizeInBits();
    uint32_t nBits = (unsigned int)ETy->getPrimitiveSizeInBits();
    IGC_ASSERT(sBits);
    IGC_ASSERT_MESSAGE((nBits % sBits) == 0, "Type mismatch in replicateScalar!");
    IGC_ASSERT_MESSAGE(nBits <= 64, "Type mismatch in replicateScalar!");
    uint32_t ratio = nBits / sBits;

    IGCLLVM::IRBuilder<> Builder(InsertBefore);
    Value* NewVal;
    if (ratio > 1)
    {
        if (ConstantInt* CI = dyn_cast<ConstantInt>(ScalarVal))
        {
            uint64_t s = CI->getZExtValue();
            uint64_t n = s;
            for (unsigned i = 1; i < ratio; ++i)
            {
                n = (n << sBits) | s;
            }
            NewVal = ConstantInt::get(ETy, n);
        }
        else
        {
            Value* nScalarVal = Builder.CreateZExt(ScalarVal, ETy);
            NewVal = nScalarVal;
            for (unsigned i = 1; i < ratio; ++i)
            {
                NewVal = Builder.CreateShl(NewVal, sBits);
                NewVal = Builder.CreateAdd(NewVal, nScalarVal);
            }
        }
    }
    else
    {
        NewVal = ScalarVal;
    }

    Value* Res;
    if (VTy)
    {
        Res = UndefValue::get(VTy);
        Type* TyI32 = Type::getInt32Ty(ScalarVal->getContext());
        for (unsigned i = 0; i < VTy->getNumElements(); ++i)
        {
            Value* Idx = ConstantInt::get(TyI32, i);
            Res = Builder.CreateInsertElement(Res, NewVal, Idx);
        }
    }
    else
    {
        Res = NewVal;
    }
    return Res;
}

// A helper function to generate vector loads or stores for efficient
// memory operations.
// If the size of the base type is to be kept, the generated vectors
// will differ:
//   <8xi32>  when the size of the base type is 32,
//   <16xi16> when the size of the base type is 16.
//
// generalGroupI8Stream() groups a stream of i8 into a stream of <8xi32> or <16xi16> as
// much as possible. Then, for the remaining i8's (< 32), it groups them
// into vectors of element type i32 (i16) and/or i8. This results in at most
// the following 5 vectors and/or scalars:
//   <4xi32>, <3xi32> or <2xi32>, i32, <2xi8>, i8, or
//   <8xi16>, <4xi16>, <2xi16>, i16, i8
// Note that we will not generate <3xi8> (see also the code for details).
// For example, given 127 i8's, we can have:
//   <8xi32>, <8xi32>, <8xi32>, <4xi32>, <3xi32>, <2xi8>, i8
//
// The grouping results are kept in Vecs, L (actual length of Vecs),
// and NumI32 (the number of <8xi32>, i.e. the count of Vecs[0]; for all
// the other vectors/scalars, i.e. Vecs[1 : L-1], the count is always 1).
// For the above case, they are:
//   Vecs[0] = <8xi32>
//   Vecs[1] = <4xi32>
//   Vecs[2] = <3xi32>
//   Vecs[3] = <2xi8>
//   Vecs[4] = i8
//   L = 5;
//   NumI32 = 3;
//
// We may generate <3xi32>, but not <3xi8>, as <3xi32> can be loaded
// or stored by a single send instruction, whereas <3xi8> cannot (and even
// though <3xi8> could be split later in VectorProcessing, it's better
// not to generate a <3xi8> vector in the first place).
//
// The same example with 127 i8's, but keeping the size of the base
// type of the initial vector as 16, gives:
//   <16xi16>, <16xi16>, <16xi16>, <8xi16>, <4xi16>, <2xi16>, i16, i8
//
// The grouping results are kept in Vecs, L (actual length of Vecs),
// and NumI32 (the number of <16xi16>, i.e. the count of Vecs[0]; for all
// the other vectors/scalars, i.e. Vecs[1 : L-1], the count is always 1).
// For the above case, they are:
//   Vecs[0] = <16xi16>
//   Vecs[1] = <8xi16>
//   Vecs[2] = <4xi16>
//   Vecs[3] = <2xi16>
//   Vecs[4] = i16
//   Vecs[5] = i8
//   L = 6;
//   NumI32 = 3;
//
// Note that Vecs[] should be allocated by callers with enough space
// to hold all vectors (6 should be enough: 1 for <8xi32> (<16xi16>),
// 5 for the others).
// From <4 x <2 x half>> (base type size 16, i.e. half) we want to
// generate <8xi16>, not <4xi32>.
// The default BaseTypeSize = 32 means that we are not concerned with
// keeping the size of the base type.
void ReplaceUnsupportedIntrinsics::generalGroupI8Stream(
    LLVMContext& C, uint32_t NumI8, uint32_t Align,
    uint32_t& VectorsNum, Type** Vecs, uint32_t& L, uint32_t BaseTypeSize = 32)
{
    VectorsNum = NumI8 / 32; // <8xi32> and <16xi16> are both 32 bytes; count of such vectors
    uint32_t RemI8 = NumI8 % 32;
    uint32_t BaseTypeSizeInBytes = BaseTypeSize / 8;
    uint32_t CntI = RemI8 / BaseTypeSizeInBytes;  // the number of i32 (0..7) or i16 (0..15)
    uint32_t CntI8 = RemI8 % BaseTypeSizeInBytes; // remaining number of i8: 0-3 for BaseTypeSize = 32,
                                                  // 0-1 for BaseTypeSize = 16

    // To process all cases (3 for i32 and 4 for i16, depending on how large CntI is)
    uint32_t Power = 256 / BaseTypeSize; // i32: (256 / 32) = 0b1000  = 8
                                         // i16: (256 / 16) = 0b10000 = 16

    Type* BaseType = Type::getIntNTy(C, BaseTypeSize);
    Type* TyI8 = Type::getInt8Ty(C);

    uint32_t n = 0;
    Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, Power);

    while ((Power >>= 1) > 1)
    {
        if (CntI >= Power)
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, Power);
            CntI -= Power;
        }
        if (CntI == 3 && BaseTypeSize == 32 && Align >= 4) // special case for the 32-bit base type: generate <3xi32> (a single send) instead of <2xi32> + i32
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(BaseType, 3);
            CntI = 0;
            break;
        }
    }
    if (CntI >= 1)
    {
        Vecs[n++] = BaseType;
        CntI -= Power; // Power is 1 at this point, so we generate i32 (i16) and not <1xi32> (<1xi16>)
    }
    IGC_ASSERT_MESSAGE(CntI == 0, "Did not handle all types of base_type");

    Power = BaseTypeSize / 4; // i32: 32 / 4 = 8
                              // i16: 16 / 4 = 4
    while ((Power >>= 1) > 1)
    {
        if (CntI8 >= Power)
        {
            Vecs[n++] = IGCLLVM::FixedVectorType::get(TyI8, Power);
            CntI8 -= Power;
        }
    }
    if (CntI8 >= 1)
    {
        Vecs[n++] = TyI8;
        CntI8 -= Power; // Power is 1 at this point, so we generate i8 and not <1xi8>
    }
    IGC_ASSERT_MESSAGE(CntI8 == 0, "Did not handle all types of I8");

    L = n;
}

void ReplaceUnsupportedIntrinsics::replaceMemcpy(IntrinsicInst* I)
{
    // The idea is to convert
    //
    //   memcpy (i8* Dst, i8* Src, len)
    //
    // into a vector load and store for cases where "len" is
    // constant. If "len" isn't constant, just use an i8 copy loop, as
    // this should not happen with OCL code (all memcpy's are
    // generated by the compiler for cases such as structure
    // assignment, etc.)
    //
    // If len is constant, it will be transformed into
    //
    //   lenv8 = len / 32 (<8xi32>);
    //   len_rem = len % 32;
    //
    //   // main loop
    //   dstV8 = bitcast Dst, <8xi32>*
    //   srcV8 = bitcast Src, <8xi32>*
    //   for(i=0; i < lenv8; ++i)
    //     dstV8[i] = srcV8[i];
    //
    //   // epilog, process remaining elements
    //   for(i=0; i < len_rem; ++i)
    //     Dst[lenv8*32 + i] = Src[lenv8*32 + i];
    //
    // Note that the epilog above is not emitted as an actual loop; it is
    // covered with as few <nxi32> and <mxi8> loads and stores as possible
    // (or, if we want to keep the size of a 16-bit base type, with <nxi16>
    // and <mxi8> accesses).
    //
    // Selecting 8 as the vector length (or 16 in the case of i16) is due to
    // the fact that A64 messages can load eight i32 or sixteen i16 per SIMD channel.
    // A32 will have 2 loads/stores for each vector, which is still efficient.
    // Unaligned vectors will be handled correctly and efficiently later
    // in vector load and store emit.
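    //
    // As an illustrative example (assuming Align >= 4 and a 32-bit base
    // type): for len = 70, generalGroupI8Stream() yields NewCount = 2 and
    // VecTys = { <8 x i32>, i32, <2 x i8> }, so the main copy is two
    // unrolled <8 x i32> load/store pairs (64 bytes), and the epilog copies
    // an i32 at byte offset 64 followed by a <2 x i8> at offset 68.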
    MemCpyInst* MC = cast<MemCpyInst>(I);
    Value* Dst = MC->getRawDest();
    Value* Src = MC->getRawSource();
    Value* LPCount = MC->getLength();
    uint32_t Align = MC->getDestAlignment();
    Align = Align != 0 ? Align : 1;
    const bool IsVolatile = MC->isVolatile();
    const uint32_t SrcAS = MC->getSourceAddressSpace();
    const uint32_t DstAS = MC->getDestAddressSpace();

    LLVMContext& C = MC->getContext();
    Type* TySrcPtrI8 = Type::getInt8PtrTy(C, SrcAS);
    Type* TyDstPtrI8 = Type::getInt8PtrTy(C, DstAS);

    IGCLLVM::IRBuilder<> Builder(MC);

    // BaseSize is a flag telling whether to handle the algorithm in the
    // general way, or to keep the size of the base type for further optimizations
    uint32_t BaseSize = 0;
    Type* RawDstType = Dst->stripPointerCasts()->getType()->getPointerElementType();
    if (Type* BaseType = GetBaseType(RawDstType))
        BaseSize = BaseType->getScalarSizeInBits();

    if (BaseSize != 16)
        // size 32 is equal to the size of i32, so the general algorithm will be applied
        BaseSize = 32;

    ConstantInt* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        uint32_t Count = (uint32_t)CI->getZExtValue();

        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len, BaseSize);

        Value* NewSrc, * NewDst, * vDst, * vSrc;
        uint32_t BOfst = 0; // Byte offset

        // First, insert main loop before MC.
        // Note that if NewCount is small, we may directly generate ld/st
        // without generating the loop.
        if (NewCount > 0)
        {
            vSrc = Builder.CreateBitCast(SkipBitCast(Src), PointerType::get(VecTys[0], SrcAS), "memcpy_vsrc");
            vDst = Builder.CreateBitCast(SkipBitCast(Dst), PointerType::get(VecTys[0], DstAS), "memcpy_vdst");

            // getPrimitiveSizeInBits() should be enough, no need to
            // use DataLayout to get target-dependent size.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);

            // To set alignment correctly
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;

            // If NewCount is less than 6, don't generate loop.
            // Note that 6 is just an arbitrary number here.
            if (NewCount < 6)
            {
                for (unsigned i = 0; i < NewCount; ++i)
                {
                    Value* tSrc = Builder.CreateConstGEP1_32(vSrc, i);
                    Value* tDst = Builder.CreateConstGEP1_32(vDst, i);
                    LoadInst* L = Builder.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
                    (void)Builder.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
                }
            }
            else
            {
                Value* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertLoop(MC, NewLPCount, "memcpy");
                {
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tSrc = B.CreateGEP(vSrc, IV);
                    Value* tDst = B.CreateGEP(vDst, IV);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
                }
            }

            BOfst = NewCount * SZ;
        }

        // Second, generate the epilog code before MC.
        // Note that MC has been moved to a different BB by
        // inserting the main loop! Reset the insert point to MC.
        Builder.SetInsertPoint(MC);
        if (Len > 1)
        {
            Src = Builder.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
            Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");
        }
        for (unsigned i = 1; i < Len; ++i)
        {
            uint32_t SZ = (unsigned int)VecTys[i]->getPrimitiveSizeInBits() / 8;
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;
            NewSrc = BOfst > 0 ? Builder.CreateConstGEP1_32(Src, BOfst) : Src;
            NewDst = BOfst > 0 ? Builder.CreateConstGEP1_32(Dst, BOfst) : Dst;
            vSrc = Builder.CreateBitCast(SkipBitCast(NewSrc), PointerType::get(VecTys[i], SrcAS), "memcpy_rem");
            vDst = Builder.CreateBitCast(SkipBitCast(NewDst), PointerType::get(VecTys[i], DstAS), "memcpy_rem");
            LoadInst* L = Builder.CreateAlignedLoad(vSrc, getAlign(Align), IsVolatile);
            (void)Builder.CreateAlignedStore(L, vDst, getAlign(Align), IsVolatile);
            BOfst += SZ;
        }
    }
    else
    {
        Src = Builder.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
        Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");
        // Fall back to i8 copy
        Instruction* IV = insertLoop(MC, LPCount, "memcpy");
        {
            IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
            Value* tSrc = B.CreateGEP(Src, IV);
            Value* tDst = B.CreateGEP(Dst, IV);
            LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(Align), IsVolatile);
            (void)B.CreateAlignedStore(L, tDst, getAlign(Align), IsVolatile);
        }
    }
    MC->eraseFromParent();
}

void ReplaceUnsupportedIntrinsics::replaceMemMove(IntrinsicInst* I)
{
    MemMoveInst* MM = cast<MemMoveInst>(I);
    Value* Dst = MM->getRawDest();
    Value* Src = MM->getRawSource();
    Value* LPCount = MM->getLength();
    uint32_t Align = MM->getDestAlignment();
    if (Align == 0)
        Align = 1;
    const bool IsVolatile = MM->isVolatile();
    const uint32_t SrcAS = MM->getSourceAddressSpace();
    const uint32_t DstAS = MM->getDestAddressSpace();

    // If non-generic address spaces mismatch, they can't alias
    // and we can do a memcpy.

    if (SrcAS < ADDRESS_SPACE_NUM_ADDRESSES &&
        DstAS < ADDRESS_SPACE_NUM_ADDRESSES &&
        SrcAS != ADDRESS_SPACE_GENERIC &&
        DstAS != ADDRESS_SPACE_GENERIC &&
        SrcAS != DstAS)
    {
        auto* MemCpy = MemMoveToMemCpy(MM);
        MemCpy->insertBefore(MM);
        replaceMemcpy(MemCpy);
        MM->eraseFromParent();
        return;
    }

    LLVMContext& C = MM->getContext();
    Type* TySrcPtrI8 = Type::getInt8PtrTy(C, SrcAS);
    Type* TyDstPtrI8 = Type::getInt8PtrTy(C, DstAS);

    auto* F = MM->getParent()->getParent();

    IGCLLVM::IRBuilder<> B(MM);

    auto* i8Src = B.CreateBitCast(SkipBitCast(Src), TySrcPtrI8, "memcpy_src");
    auto* i8Dst = B.CreateBitCast(SkipBitCast(Dst), TyDstPtrI8, "memcpy_dst");

    // Setup control flow to do:
    // if (Src < Dst)
    //     reverse copy data
    // else
    //     normal copy (such as memcpy())

    // Src < Dst
    Value* pCmp = nullptr;
    {
        auto* cmpCastSrc = (DstAS == ADDRESS_SPACE_GENERIC) ?
            B.CreateAddrSpaceCast(i8Src, TyDstPtrI8) : i8Src;
        auto* cmpCastDst = (SrcAS == ADDRESS_SPACE_GENERIC) ?
            B.CreateAddrSpaceCast(i8Dst, TySrcPtrI8) : i8Dst;

        pCmp = B.CreateICmpULT(cmpCastSrc, cmpCastDst);
    }

    auto* Pre = MM->getParent();
    auto* Post = Pre->splitBasicBlock(MM, "memmove.post");

    Pre->getTerminator()->eraseFromParent();

    auto* BBTrue = BasicBlock::Create(C, "memmove.true", F, Post);
    auto* BBFalse = BasicBlock::Create(C, "memmove.false", F, Post);

    B.SetInsertPoint(Pre);
    B.CreateCondBr(pCmp, BBTrue, BBFalse);

    B.SetInsertPoint(BBTrue);
    B.CreateBr(Post);
    B.SetInsertPoint(BBFalse);
    B.CreateBr(Post);

    auto* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        uint32_t Count = (uint32_t)CI->getZExtValue();

        // noop
        if (Count == 0)
        {
            MM->eraseFromParent();
            return;
        }

        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len);

        // for true block (Src < Dst), do a reverse copy.
        {
            B.SetInsertPoint(BBTrue->getTerminator());

            // calculate byte offsets so we can walk backwards through them
            SmallVector<uint, 8> byteOffsets{ 0 };

            {
                uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);
                uint32_t BOfst = NewCount * SZ;

                for (unsigned i = 1; i < Len; i++)
                {
                    byteOffsets.push_back(BOfst);
                    uint32_t SZ = (unsigned int)(VecTys[i]->getPrimitiveSizeInBits() / 8);
                    BOfst += SZ;
                }
            }

            // emit the smaller than <8 x i32> stores
            for (unsigned i = Len - 1; i >= 1; i--)
            {
                uint offset = byteOffsets[i];
                uint32_t newAlign = getLargestPowerOfTwo(Align + offset);
                auto* tSrc = B.CreateConstGEP1_32(i8Src, offset);
                auto* tDst = B.CreateConstGEP1_32(i8Dst, offset);

                auto* vSrc = B.CreateBitCast(SkipBitCast(tSrc), PointerType::get(VecTys[i], SrcAS), "memcpy_rem");
                auto* vDst = B.CreateBitCast(SkipBitCast(tDst), PointerType::get(VecTys[i], DstAS), "memcpy_rem");
                LoadInst* L = B.CreateAlignedLoad(vSrc, getAlign(newAlign), IsVolatile);
                (void)B.CreateAlignedStore(L, vDst, getAlign(newAlign), IsVolatile);
            }

            // now emit the <8 x i32> stores
            auto* vSrc = B.CreateBitCast(SkipBitCast(Src), PointerType::get(VecTys[0], SrcAS), "memcpy_vsrc");
            auto* vDst = B.CreateBitCast(SkipBitCast(Dst), PointerType::get(VecTys[0], DstAS), "memcpy_vdst");
            // If NewCount is less than 6, don't generate loop.
            // Note that 6 is just an arbitrary number here.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);
            uint32_t newAlign = getLargestPowerOfTwo(Align + SZ);
            if (NewCount < 6)
            {
                for (unsigned i = 0; i < NewCount; i++)
                {
                    unsigned idx = NewCount - 1 - i;
                    auto* tSrc = B.CreateConstGEP1_32(vSrc, idx);
                    auto* tDst = B.CreateConstGEP1_32(vDst, idx);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(newAlign), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(newAlign), IsVolatile);
                }
            }
            else
            {
                auto* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertReverseLoop(BBTrue, Post, NewLPCount, "memmove");
                {
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tSrc = B.CreateGEP(vSrc, IV);
                    Value* tDst = B.CreateGEP(vDst, IV);
                    LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(newAlign), IsVolatile);
                    (void)B.CreateAlignedStore(L, tDst, getAlign(newAlign), IsVolatile);
                }
            }
        }

        // for false block (Src >= Dst), just a plain memcpy.
        {
            auto* MemCpy = MemMoveToMemCpy(MM);
            MemCpy->insertBefore(BBFalse->getTerminator());
            replaceMemcpy(MemCpy);
        }
    }
    else
    {
        // (Src < Dst)
        {
            B.SetInsertPoint(BBTrue->getTerminator());
            // Fall back to i8 copy
            Instruction* IV = insertReverseLoop(BBTrue, Post, LPCount, "memmove");
            {
                IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                Value* tSrc = B.CreateGEP(i8Src, IV);
                Value* tDst = B.CreateGEP(i8Dst, IV);
                LoadInst* L = B.CreateAlignedLoad(tSrc, getAlign(1), IsVolatile);
                (void)B.CreateAlignedStore(L, tDst, getAlign(1), IsVolatile);
            }
        }

        // for false block (Src >= Dst), just a plain memcpy.
        {
            auto* MemCpy = MemMoveToMemCpy(MM);
            MemCpy->insertBefore(BBFalse->getTerminator());
            replaceMemcpy(MemCpy);
        }
    }

    MM->eraseFromParent();
}

void ReplaceUnsupportedIntrinsics::replaceMemset(IntrinsicInst* I)
{
    // Same idea as replaceMemcpy (see the comment in replaceMemcpy).
    MemSetInst* MS = cast<MemSetInst>(I);
    Value* Dst = MS->getRawDest();
    Value* Src = MS->getValue();
    Value* LPCount = MS->getLength();
    uint32_t Align = MS->getDestAlignment();
    const bool IsVolatile = MS->isVolatile();
    const uint32_t AS = MS->getDestAddressSpace();

    LLVMContext& C = MS->getContext();
    Type* TyPtrI8 = Type::getInt8PtrTy(C, AS);

    IGCLLVM::IRBuilder<> Builder(MS);

    ConstantInt* CI = dyn_cast<ConstantInt>(LPCount);
    if (CI)
    {
        uint32_t Count = (uint32_t)CI->getZExtValue();

        Type* VecTys[8];
        uint32_t Len, NewCount;
        generalGroupI8Stream(C, Count, Align, NewCount, VecTys, Len);

        Value* NewDst, * vDst, * vSrc;
        uint32_t BOfst = 0; // Byte offset

        // First, insert main loop before MS.
        if (NewCount > 0)
        {
            PointerType* PTy = PointerType::get(VecTys[0], AS);
            vSrc = replicateScalar(Src, VecTys[0], MS);
            vDst = Builder.CreateBitCast(SkipBitCast(Dst), PTy, "memset_vdst");

            // getPrimitiveSizeInBits() should be enough, no need to
            // use DataLayout to get target-dependent size.
            uint32_t SZ = (unsigned int)(VecTys[0]->getPrimitiveSizeInBits() / 8);

            // To set alignment correctly
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;

            // If NewCount is less than 6, don't generate loop.
            // Note that 6 is just an arbitrary number here.
            if (NewCount < 6)
            {
                for (unsigned i = 0; i < NewCount; ++i)
                {
                    Value* tDst = Builder.CreateConstGEP1_32(vDst, i);
                    (void)Builder.CreateAlignedStore(vSrc, tDst, getAlign(Align), IsVolatile);
                }
            }
            else
            {
                Value* NewLPCount = ConstantInt::get(LPCount->getType(), NewCount);
                Instruction* IV = insertLoop(MS, NewLPCount, "memset");
                {
                    IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
                    Value* tDst = B.CreateGEP(vDst, IV);
                    (void)B.CreateAlignedStore(vSrc, tDst, getAlign(Align), IsVolatile);
                }
            }

            // Set offset for the remaining elements
            BOfst = NewCount * SZ;
        }

        // Second, generate the epilog code before MS.
        // Note that MS has been moved to a different BB by
        // inserting the main loop! Reset the insert point to MS.
        Builder.SetInsertPoint(MS);
        if (Len > 1)
        {
            Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyPtrI8, "memset_dst");
        }
        for (unsigned i = 1; i < Len; ++i)
        {
            uint32_t SZ = (unsigned int)VecTys[i]->getPrimitiveSizeInBits() / 8;
            uint32_t adjust_align = getLargestPowerOfTwo(SZ);
            Align = adjust_align < Align ? adjust_align : Align;
            PointerType* PTy = PointerType::get(VecTys[i], AS);
            NewDst = BOfst > 0 ? Builder.CreateConstGEP1_32(Dst, BOfst) : Dst;
            vSrc = replicateScalar(Src, VecTys[i], MS);
            vDst = Builder.CreateBitCast(SkipBitCast(NewDst), PTy, "memset_rem");
            (void)Builder.CreateAlignedStore(vSrc, vDst, getAlign(Align), IsVolatile);
            BOfst += SZ;
        }
    }
    else
    {
        Dst = Builder.CreateBitCast(SkipBitCast(Dst), TyPtrI8, "memset_dst");
        // Fall back to an i8 store loop
        Instruction* IV = insertLoop(MS, LPCount, "memset");
        {
            IGCLLVM::IRBuilder<> B(&(*++BasicBlock::iterator(IV)));
            Value* tDst = B.CreateGEP(Dst, IV);
            (void)B.CreateAlignedStore(Src, tDst, getAlign(Align), IsVolatile);
        }
    }
    MS->eraseFromParent();
}

void ReplaceUnsupportedIntrinsics::replaceExpect(IntrinsicInst* MS)
{
    MS->replaceAllUsesWith(MS->getOperand(0));
    MS->eraseFromParent();
}

/*
    Replaces llvm.fshl.* and llvm.fshr.* funnel shift intrinsics.
    E.g. for fshl we would produce the following sequence:
    %r = call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) =>
        %modRes = urem i8 %c, 8      // take the shift amount modulo the type's bit width
        %subRes = sub i8 8, %modRes  // subtract it from the type's number of bits
        %shlRes = shl i8 %a, %modRes // shift the bits according to the instruction spec
        %shrRes = lshr i8 %b, %subRes
        %r = or i8 %shlRes, %shrRes  // compose the final result
*/
void ReplaceUnsupportedIntrinsics::replaceFunnelShift(IntrinsicInst* I) {
    IGC_ASSERT(I->getIntrinsicID() == Intrinsic::fshl ||
        I->getIntrinsicID() == Intrinsic::fshr);
    IGCLLVM::IRBuilder<> Builder(I);
    unsigned sizeInBits = I->getArgOperand(0)->getType()->getScalarSizeInBits();

    // Don't replace rotate
    if (I->getArgOperand(0) == I->getArgOperand(1) && !I->getType()->isVectorTy() &&
        m_Ctx->platform.supportRotateInstruction())
    {
        if (sizeInBits == 16 || sizeInBits == 32) {
            return;
        }
    }

    Value* numBits = Builder.getIntN(sizeInBits, sizeInBits);
    if (auto IVT = dyn_cast<IGCLLVM::FixedVectorType>(I->getType())) {
        numBits = ConstantVector::getSplat(IGCLLVM::getElementCount((uint32_t)IVT->getNumElements()), cast<Constant>(numBits));
    }
    auto shiftModulo = Builder.CreateURem(I->getArgOperand(2), numBits);
    auto negativeShift = Builder.CreateSub(numBits, shiftModulo);
    if (I->getIntrinsicID() == Intrinsic::fshr) {
        std::swap(shiftModulo, negativeShift);
    }
    auto upperShifted = Builder.CreateShl(I->getArgOperand(0), shiftModulo);
    auto lowerShifted = Builder.CreateLShr(I->getArgOperand(1), negativeShift);
    auto result = Builder.CreateOr(upperShifted, lowerShifted);

    I->replaceAllUsesWith(result);
    I->eraseFromParent();
}

void ReplaceUnsupportedIntrinsics::visitIntrinsicInst(IntrinsicInst& I) {
    if (m_intrinsicToFunc.find(I.getIntrinsicID()) != m_intrinsicToFunc.end()) {
        m_instsToReplace.push_back(&I);
    }
}

bool ReplaceUnsupportedIntrinsics::runOnFunction(Function& F)
{
    m_Ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    m_instsToReplace.clear();
    visit(F);
    for (auto I : m_instsToReplace) {
        (this->*m_intrinsicToFunc.at(I->getIntrinsicID()))(I);
    }
    return !m_instsToReplace.empty();
}

FunctionPass* IGC::createReplaceUnsupportedIntrinsicsPass()
{
    return new ReplaceUnsupportedIntrinsics();
}