//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

namespace llvm {

class AMDGPULibCalls {
private:
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  typedef llvm::AMDGPULibFunc FuncInfo;

  bool UnsafeFPMath = false;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if a function definition
  // with "FuncName" exists. It may create a new function prototype in
  // pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // Evaluate calls when the call arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a call to the sincos function \p Fsincos. Returns (value of sin,
  /// value of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSizeF32 is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }

public:
  AMDGPULibCalls() {}

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with its native version.
  bool useNative(CallInst *CI);
};

} // end llvm namespace

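// Thin wrappers around IRBuilder::CreateCall that also propagate the callee's
// calling convention onto the new call site, since the library functions may
// not use the default calling convention.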
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

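// Build the pown() signature corresponding to a pow() signature: same return
// and base types, with the exponent replaced by i32 (widened to a matching
// vector of i32 when the return type is a vector).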
static FunctionType *getPownType(FunctionType *FT) {
  Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
  if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
    PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());

  return FunctionType::get(FT->getReturnType(),
                           {FT->getParamType(0), PowNExpTy}, false);
}

// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument

struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, 0.0},
  {MATH_PI / 2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 2.0, 1.0},
  {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 4.0, 1.0},
  {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};

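// Return true if the math library provides a native_* variant of the given
// function.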
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:;
  }
  return false;
}

using TableRef = ArrayRef<TableEntry>;

static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
  case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
  case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
  case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
  case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
  case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
  case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
  case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
  case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
  case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
  case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
  case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
  case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
  case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
  case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
  case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
  case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
  case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
  case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
  case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
  case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
  case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
  case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
  case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
  case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
  default:;
  }
  return TableRef();
}

static inline int getVecSize(const AMDGPULibFunc &FInfo) {
  return FInfo.getLeads()[0].VectorSize;
}

static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc &FInfo) {
  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}

FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // If we are doing PreLinkOpt, the function is external. So it is safe to
  // use getOrInsertFunction() at this stage.

  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
                       : AMDGPULibFunc::getFunction(M, fInfo);
}

bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}

bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath || FPOp->isFast();
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath ||
         (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}

void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
  UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  AC = &FAM.getResult<AssumptionAnalysis>(F);
  TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
}

bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}

bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");

  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(0);

    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, nf);
    if (sinExpr && cosExpr) {
      Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
      Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
      new StoreInst(cosval, aCI->getArgOperand(1), aCI);

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      replaceCall(aCI, sinval);
      return true;
    }
  }
  return false;
}

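// Replace a math library call with its native_* counterpart when requested
// via -amdgpu-use-native. sincos has no direct native equivalent, so it is
// split into separate native sin and cos calls.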
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}

// Clang emits calls of __read_pipe_2 or __read_pipe_4 for the OpenCL
// read_pipe builtin, with appended type size and alignment arguments, where 2
// or 4 indicates the original number of arguments. The library has optimized
// versions of __read_pipe_2/__read_pipe_4 for when the type size and
// alignment are the same power of 2 value. This function transforms
// __read_pipe_2 to __read_pipe_2_N for such cases, where N is the size in
// bytes of the type (N = 1, 2, 4, 8, ..., 128). The same applies to
// __read_pipe_4, __write_pipe_2, and __write_pipe_4.
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(PtrArg);

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}

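// Return true when \p V is known to hold an integral floating-point value: an
// integral FP constant (or vector thereof), an int-to-FP conversion that
// cannot overflow to infinity, or a rounding intrinsic whose result is known
// to be neither infinity nor NaN.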
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  if (isa<UndefValue>(V))
    return true;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
    return CF->getValueAPF().isInteger();

  if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(V)) {
    for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) {
      Constant *ConstElt = CDV->getElementAsConstant(i);
      if (isa<UndefValue>(ConstElt))
        continue;
      const ConstantFP *CFP = dyn_cast<ConstantFP>(ConstElt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check the int size cannot produce infinity, which
    // computeKnownFPClass knows how to do already.
    return isKnownNeverInfinity(I, DL);
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(I);
    switch (CI->getIntrinsicID()) {
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(I, DL, nullptr);
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}

// This function returns false if there is no change; otherwise it returns
// true.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
        return false;

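      // llvm.ldexp requires the exponent operand to match the result's vector
      // width, so splat a scalar exponent when the result type is a vector.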
      Value *Arg1 = CI->getArgOperand(1);
      if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
          VecTy && !isa<VectorType>(Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
        CI->setArgOperand(1, SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getDeclaration(
          CI->getModule(), Intrinsic::ldexp,
          {CI->getType(), CI->getArgOperand(1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, PowrInfo);
      CallInst *Call = cast<CallInst>(FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(FPOp->getOperand(0), M->getDataLayout(),
                                      TLInfo, 0, AC, Call, DT)) {
        Call->setCalledFunction(PowrFunc);
        return fold_pow(FPOp, B, PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
                          FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              1, AttributeFuncs::typeIncompatible(CastedArg->getType()));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(1, CastedArg);
          return fold_pow(FPOp, B, PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}

bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
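  // Replace a call whose constant argument matches one of the special-case
  // inputs tabulated for this function, for scalar FP constants and for
  // constant data vectors in which every lane has a tabulated input.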
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(0);

  if (getVecSize(FInfo) > 1) {
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            CV->getElementAsConstant((unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!");
        bool found = false;
        for (int i = 0; i < sz; ++i) {
          if (eltval->isExactlyValue(tr[i].input)) {
            DVal.push_back(tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constant is not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (unsigned i = 0; i < DVal.size(); ++i) {
          FVal.push_back((float)DVal[i]);
        }
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(context, tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(context, tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
      replaceCall(CI, nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(tr[i].input)) {
          Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
          replaceCall(CI, nval);
          return true;
        }
      }
    }
  }

  return false;
}

namespace llvm {
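// ::log2 is only guaranteed to be available under C99/POSIX feature-test
// macros, so fall back to deriving it from the natural logarithm elsewhere.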
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
} // namespace llvm

bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call");

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(opr1, m_APFloatAllowUndef(CF)))
    match(opr1, m_APIntAllowUndef(CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    replaceCall(FPOp, cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    replaceCall(FPOp, nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    replaceCall(FPOp, nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                : AMDGPULibFunc::EI_RSQRT,
                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0,
                                 issqrt ? "__pow2sqrt" : "__pow2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }

  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  //   trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

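  // Decide which pieces the expansion needs: a runtime log2 when the base is
  // not a constant, fabs when the base may be negative, and a final sign
  // fixup for pown/pow with a possibly negative base.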
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowUndef(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
             "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0)
          needcopysign = true;
        V = log2(std::abs(V));
        DVal.push_back(V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (unsigned i = 0; i < DVal.size(); ++i) {
          FVal.push_back((float)DVal[i]);
        }
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr =
          Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B, LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");

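  // Apply the sign of x for odd integer exponents: shifting the integer
  // exponent y left by (bits - 1) leaves exactly the sign bit set when y is
  // odd; ANDing that with the bits of x keeps x's sign bit only in that case,
  // and it is then ORed into the exp2 result. E.g. for pown(x, 5) with x < 0,
  // (5 << 31) == 0x80000000, so the result inherits x's sign.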
  if (needcopysign) {
    Value *opr_n;
    Type *rTy = opr0->getType();
    Type *nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = nTyS;
    if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
      nTy = FixedVectorType::get(nTyS, vTy);
    unsigned size = nTy->getScalarSizeInBits();
    opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}

bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  // Skip vector functions.
  if (getVecSize(FInfo) != 1)
    return false;

  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
  if (!CINT) {
    return false;
  }
  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1) { // rootn(x, 1) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();
  if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2sqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
                               opr0,
                               "__rootn2div");
    replaceCall(FPOp, nval);
    return true;
  } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }
  return false;
}

// Get a scalar native builtin single argument FP function
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
    return nullptr;
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}

// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;
  return true;
}

void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
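  // The IR intrinsics require matching operand types for binary operations,
  // so splat a scalar operand when the other operand is a vector.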
  if (CI->arg_size() == 2) {
    Value *Arg0 = CI->getArgOperand(0);
    Value *Arg1 = CI->getArgOperand(1);
    VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
    VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
    if (Arg0VecTy && !Arg1VecTy) {
      Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
      CI->setArgOperand(1, SplatRHS);
    } else if (!Arg0VecTy && Arg1VecTy) {
      Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
      CI->setArgOperand(0, SplatLHS);
    }
  }

  CI->setCalledFunction(
      Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()}));
}

bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
                                         AllowStrictFP))
    return false;
  replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return true;
}

std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The alloca allocates memory in the private address space, which needs to
  // be addrspacecast to the address space of the cos pointer type. In OpenCL
  // 2.0 that is generic, while in 1.2 it is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the debug location of the cos calls
  // for the load?

  LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}

// fold sin, cos -> sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);
  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

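  // Scan the other uses of the argument for sin, cos, and sincos calls on the
  // same value so all of them can be rewritten against one sincos call,
  // merging fast-math flags, !fpmath metadata, and debug locations.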
  for (User *U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}

bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // By default, opr0/opr1 hold values of float/double type.
  // If they are not float/double, each function has to handle its
  // operands separately.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default:
    return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(opr0 + sqrt(opr0 * opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(opr0 + sqrt(opr0 * opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(2.0, opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(10.0, opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(opr0) / log(2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(opr0) / log(10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(opr0, opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(opr0);
    Res1 = cos(opr0);
    return true;
  }

  return false;
}

bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
      return false;
  }

  if (numArgs > 1) {
    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
        return false;
    }
  }

  // At this point, all arguments to aCI are constants, except possibly the
  // sincos pointer argument.

  // The max vector size is 16, and sincos generates two results.
  double DVal0[16], DVal1[16];
  int FuncVecSize = getVecSize(FInfo);
  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
      return false;
    }
  } else {
    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
        return false;
      }
    }
  }

  LLVMContext &context = aCI->getContext();
  Constant *nval0, *nval1;
  if (FuncVecSize == 1) {
    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
    if (hasTwoResults)
      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
  } else {
    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
      SmallVector<float, 0> FVal0, FVal1;
      for (int i = 0; i < FuncVecSize; ++i)
        FVal0.push_back((float)DVal0[i]);
      ArrayRef<float> tmp0(FVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        for (int i = 0; i < FuncVecSize; ++i)
          FVal1.push_back((float)DVal1[i]);
        ArrayRef<float> tmp1(FVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    } else {
      ArrayRef<double> tmp0(DVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        ArrayRef<double> tmp1(DVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    }
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI);
  }

  replaceCall(aCI, nval0);
  return true;
}

PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;

      if (CI) {
        if (Simplifier.fold(CI))
          Changed = true;
      }
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;
  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;
      if (CI && Simplifier.useNative(CI))
        Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}