1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This contains code to emit Builtin calls as LLVM code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ABIInfo.h"
14 #include "CGCUDARuntime.h"
15 #include "CGCXXABI.h"
16 #include "CGObjCRuntime.h"
17 #include "CGOpenCLRuntime.h"
18 #include "CGRecordLayout.h"
19 #include "CodeGenFunction.h"
20 #include "CodeGenModule.h"
21 #include "ConstantEmitter.h"
22 #include "PatternInit.h"
23 #include "TargetInfo.h"
24 #include "clang/AST/ASTContext.h"
25 #include "clang/AST/Attr.h"
26 #include "clang/AST/Decl.h"
27 #include "clang/AST/OSLog.h"
28 #include "clang/AST/OperationKinds.h"
29 #include "clang/Basic/TargetBuiltins.h"
30 #include "clang/Basic/TargetInfo.h"
31 #include "clang/Basic/TargetOptions.h"
32 #include "clang/CodeGen/CGFunctionInfo.h"
33 #include "clang/Frontend/FrontendDiagnostic.h"
34 #include "llvm/ADT/APFloat.h"
35 #include "llvm/ADT/APInt.h"
36 #include "llvm/ADT/FloatingPointMode.h"
37 #include "llvm/ADT/SmallPtrSet.h"
38 #include "llvm/ADT/StringExtras.h"
39 #include "llvm/Analysis/ValueTracking.h"
40 #include "llvm/IR/DataLayout.h"
41 #include "llvm/IR/InlineAsm.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/IntrinsicsAArch64.h"
44 #include "llvm/IR/IntrinsicsAMDGPU.h"
45 #include "llvm/IR/IntrinsicsARM.h"
46 #include "llvm/IR/IntrinsicsBPF.h"
47 #include "llvm/IR/IntrinsicsHexagon.h"
48 #include "llvm/IR/IntrinsicsNVPTX.h"
49 #include "llvm/IR/IntrinsicsPowerPC.h"
50 #include "llvm/IR/IntrinsicsR600.h"
51 #include "llvm/IR/IntrinsicsRISCV.h"
52 #include "llvm/IR/IntrinsicsS390.h"
53 #include "llvm/IR/IntrinsicsVE.h"
54 #include "llvm/IR/IntrinsicsWebAssembly.h"
55 #include "llvm/IR/IntrinsicsX86.h"
56 #include "llvm/IR/MDBuilder.h"
57 #include "llvm/IR/MatrixBuilder.h"
58 #include "llvm/Support/ConvertUTF.h"
59 #include "llvm/Support/MathExtras.h"
60 #include "llvm/Support/ScopedPrinter.h"
61 #include "llvm/TargetParser/AArch64TargetParser.h"
62 #include "llvm/TargetParser/X86TargetParser.h"
63 #include <optional>
64 #include <sstream>
65 
66 using namespace clang;
67 using namespace CodeGen;
68 using namespace llvm;
69 
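// Initialize an alloca emitted for a builtin (e.g. __builtin_alloca) according
// to the -ftrivial-auto-var-init mode: do nothing, fill with zeroes, or fill
// with the target's pattern byte. The emitted memset is annotated "auto-init"
// so that later passes can recognize it.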
70 static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size,
71                              Align AlignmentInBytes) {
72   ConstantInt *Byte;
73   switch (CGF.getLangOpts().getTrivialAutoVarInit()) {
74   case LangOptions::TrivialAutoVarInitKind::Uninitialized:
75     // Nothing to initialize.
76     return;
77   case LangOptions::TrivialAutoVarInitKind::Zero:
78     Byte = CGF.Builder.getInt8(0x00);
79     break;
80   case LangOptions::TrivialAutoVarInitKind::Pattern: {
81     llvm::Type *Int8 = llvm::IntegerType::getInt8Ty(CGF.CGM.getLLVMContext());
82     Byte = llvm::dyn_cast<llvm::ConstantInt>(
83         initializationPatternFor(CGF.CGM, Int8));
84     break;
85   }
86   }
87   if (CGF.CGM.stopAutoInit())
88     return;
89   auto *I = CGF.Builder.CreateMemSet(AI, Byte, Size, AlignmentInBytes);
90   I->addAnnotationMetadata("auto-init");
91 }
92 
93 /// getBuiltinLibFunction - Given a builtin id for a function like
94 /// "__builtin_fabsf", return a Function* for "fabsf".
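///
/// For example, on PPC64 targets where 'long double' is IEEE 128-bit, the
/// printf/scanf family is redirected to the *ieee128 versions, so
/// "__builtin_printf" maps to "__printfieee128" rather than "printf".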
95 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
96                                                      unsigned BuiltinID) {
97   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
98 
99   // Get the name, skip over the __builtin_ prefix (if necessary).
100   StringRef Name;
101   GlobalDecl D(FD);
102 
103   // TODO: This list should be expanded or refactored after all GCC-compatible
104   // std libcall builtins are implemented.
105   static SmallDenseMap<unsigned, StringRef, 64> F128Builtins{
106       {Builtin::BI__builtin___fprintf_chk, "__fprintf_chkieee128"},
107       {Builtin::BI__builtin___printf_chk, "__printf_chkieee128"},
108       {Builtin::BI__builtin___snprintf_chk, "__snprintf_chkieee128"},
109       {Builtin::BI__builtin___sprintf_chk, "__sprintf_chkieee128"},
110       {Builtin::BI__builtin___vfprintf_chk, "__vfprintf_chkieee128"},
111       {Builtin::BI__builtin___vprintf_chk, "__vprintf_chkieee128"},
112       {Builtin::BI__builtin___vsnprintf_chk, "__vsnprintf_chkieee128"},
113       {Builtin::BI__builtin___vsprintf_chk, "__vsprintf_chkieee128"},
114       {Builtin::BI__builtin_fprintf, "__fprintfieee128"},
115       {Builtin::BI__builtin_printf, "__printfieee128"},
116       {Builtin::BI__builtin_snprintf, "__snprintfieee128"},
117       {Builtin::BI__builtin_sprintf, "__sprintfieee128"},
118       {Builtin::BI__builtin_vfprintf, "__vfprintfieee128"},
119       {Builtin::BI__builtin_vprintf, "__vprintfieee128"},
120       {Builtin::BI__builtin_vsnprintf, "__vsnprintfieee128"},
121       {Builtin::BI__builtin_vsprintf, "__vsprintfieee128"},
122       {Builtin::BI__builtin_fscanf, "__fscanfieee128"},
123       {Builtin::BI__builtin_scanf, "__scanfieee128"},
124       {Builtin::BI__builtin_sscanf, "__sscanfieee128"},
125       {Builtin::BI__builtin_vfscanf, "__vfscanfieee128"},
126       {Builtin::BI__builtin_vscanf, "__vscanfieee128"},
127       {Builtin::BI__builtin_vsscanf, "__vsscanfieee128"},
128       {Builtin::BI__builtin_nexttowardf128, "__nexttowardieee128"},
129   };
130 
  // The AIX library functions frexpl, ldexpl, and modfl are for 128-bit
  // IBM 'long double' (i.e. __ibm128). Map them to the 'double' versions
  // when 'long double' is only 64 bits wide.
134   static SmallDenseMap<unsigned, StringRef, 4> AIXLongDouble64Builtins{
135       {Builtin::BI__builtin_frexpl, "frexp"},
136       {Builtin::BI__builtin_ldexpl, "ldexp"},
137       {Builtin::BI__builtin_modfl, "modf"},
138   };
139 
140   // If the builtin has been declared explicitly with an assembler label,
141   // use the mangled name. This differs from the plain label on platforms
142   // that prefix labels.
143   if (FD->hasAttr<AsmLabelAttr>())
144     Name = getMangledName(D);
145   else {
    // TODO: This mutation should also be applied to targets other than PPC
    // once their backends support IEEE 128-bit style libcalls.
148     if (getTriple().isPPC64() &&
149         &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad() &&
150         F128Builtins.contains(BuiltinID))
151       Name = F128Builtins[BuiltinID];
152     else if (getTriple().isOSAIX() &&
153              &getTarget().getLongDoubleFormat() ==
154                  &llvm::APFloat::IEEEdouble() &&
155              AIXLongDouble64Builtins.contains(BuiltinID))
156       Name = AIXLongDouble64Builtins[BuiltinID];
157     else
158       Name = Context.BuiltinInfo.getName(BuiltinID).substr(10);
159   }
160 
161   llvm::FunctionType *Ty =
162     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
163 
164   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
165 }
166 
167 /// Emit the conversions required to turn the given value into an
168 /// integer of the given size.
169 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
170                         QualType T, llvm::IntegerType *IntType) {
171   V = CGF.EmitToMemory(V, T);
172 
173   if (V->getType()->isPointerTy())
174     return CGF.Builder.CreatePtrToInt(V, IntType);
175 
176   assert(V->getType() == IntType);
177   return V;
178 }
179 
180 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
181                           QualType T, llvm::Type *ResultType) {
182   V = CGF.EmitFromMemory(V, T);
183 
184   if (ResultType->isPointerTy())
185     return CGF.Builder.CreateIntToPtr(V, ResultType);
186 
187   assert(V->getType() == ResultType);
188   return V;
189 }
190 
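// Check that the pointer argument to an atomic builtin is naturally aligned
// for its operation size. If it is not, emit a misaligned-operation warning
// and force the returned address to the natural alignment.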
191 static Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E) {
192   ASTContext &Ctx = CGF.getContext();
193   Address Ptr = CGF.EmitPointerWithAlignment(E->getArg(0));
194   unsigned Bytes = Ptr.getElementType()->isPointerTy()
195                        ? Ctx.getTypeSizeInChars(Ctx.VoidPtrTy).getQuantity()
196                        : Ptr.getElementType()->getScalarSizeInBits() / 8;
197   unsigned Align = Ptr.getAlignment().getQuantity();
198   if (Align % Bytes != 0) {
199     DiagnosticsEngine &Diags = CGF.CGM.getDiags();
200     Diags.Report(E->getBeginLoc(), diag::warn_sync_op_misaligned);
201     // Force address to be at least naturally-aligned.
202     return Ptr.withAlignment(CharUnits::fromQuantity(Bytes));
203   }
204   return Ptr;
205 }
206 
207 /// Utility to insert an atomic instruction based on Intrinsic::ID
208 /// and the expression node.
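///
/// For example, __sync_fetch_and_add(ptr, val) is lowered to an
/// 'atomicrmw add' on ptr whose result is the value previously stored there.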
209 static Value *MakeBinaryAtomicValue(
210     CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E,
211     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
212 
213   QualType T = E->getType();
214   assert(E->getArg(0)->getType()->isPointerType());
215   assert(CGF.getContext().hasSameUnqualifiedType(T,
216                                   E->getArg(0)->getType()->getPointeeType()));
217   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
218 
219   Address DestAddr = CheckAtomicAlignment(CGF, E);
220 
221   llvm::IntegerType *IntType = llvm::IntegerType::get(
222       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
223 
224   llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(1));
225   llvm::Type *ValueType = Val->getType();
226   Val = EmitToInt(CGF, Val, T, IntType);
227 
228   llvm::Value *Result =
229       CGF.Builder.CreateAtomicRMW(Kind, DestAddr, Val, Ordering);
230   return EmitFromInt(CGF, Result, T, ValueType);
231 }
232 
233 static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
234   Value *Val = CGF.EmitScalarExpr(E->getArg(0));
235   Address Addr = CGF.EmitPointerWithAlignment(E->getArg(1));
236 
237   Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
238   LValue LV = CGF.MakeAddrLValue(Addr, E->getArg(0)->getType());
239   LV.setNontemporal(true);
240   CGF.EmitStoreOfScalar(Val, LV, false);
241   return nullptr;
242 }
243 
244 static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
245   Address Addr = CGF.EmitPointerWithAlignment(E->getArg(0));
246 
247   LValue LV = CGF.MakeAddrLValue(Addr, E->getType());
248   LV.setNontemporal(true);
249   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
250 }
251 
252 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
253                                llvm::AtomicRMWInst::BinOp Kind,
254                                const CallExpr *E) {
255   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
256 }
257 
/// Utility to insert an atomic instruction based on Intrinsic::ID and
259 /// the expression node, where the return value is the result of the
260 /// operation.
261 static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
262                                    llvm::AtomicRMWInst::BinOp Kind,
263                                    const CallExpr *E,
264                                    Instruction::BinaryOps Op,
265                                    bool Invert = false) {
266   QualType T = E->getType();
267   assert(E->getArg(0)->getType()->isPointerType());
268   assert(CGF.getContext().hasSameUnqualifiedType(T,
269                                   E->getArg(0)->getType()->getPointeeType()));
270   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
271 
272   Address DestAddr = CheckAtomicAlignment(CGF, E);
273 
274   llvm::IntegerType *IntType = llvm::IntegerType::get(
275       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
276 
277   llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(1));
278   llvm::Type *ValueType = Val->getType();
279   Val = EmitToInt(CGF, Val, T, IntType);
280 
281   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
282       Kind, DestAddr, Val, llvm::AtomicOrdering::SequentiallyConsistent);
283   Result = CGF.Builder.CreateBinOp(Op, Result, Val);
284   if (Invert)
285     Result =
286         CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
287                                 llvm::ConstantInt::getAllOnesValue(IntType));
288   Result = EmitFromInt(CGF, Result, T, ValueType);
289   return RValue::get(Result);
290 }
291 
292 /// Utility to insert an atomic cmpxchg instruction.
293 ///
294 /// @param CGF The current codegen function.
295 /// @param E   Builtin call expression to convert to cmpxchg.
296 ///            arg0 - address to operate on
297 ///            arg1 - value to compare with
298 ///            arg2 - new value
/// @param ReturnBool Specifies whether to return the success flag of the
///                   cmpxchg or the old value.
301 ///
302 /// @returns result of cmpxchg, according to ReturnBool
303 ///
/// Note: To lower Microsoft's _InterlockedCompareExchange* intrinsics, invoke
/// the function EmitAtomicCmpXchgForMSIntrin instead.
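///
/// For example, __sync_bool_compare_and_swap(p, old, new) returns the cmpxchg
/// success flag zero-extended to the call's result type, while
/// __sync_val_compare_and_swap(p, old, new) returns the value that was loaded
/// from p.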
306 static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
307                                      bool ReturnBool) {
308   QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
309   Address DestAddr = CheckAtomicAlignment(CGF, E);
310 
311   llvm::IntegerType *IntType = llvm::IntegerType::get(
312       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
313 
314   Value *Cmp = CGF.EmitScalarExpr(E->getArg(1));
315   llvm::Type *ValueType = Cmp->getType();
316   Cmp = EmitToInt(CGF, Cmp, T, IntType);
317   Value *New = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
318 
319   Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
320       DestAddr, Cmp, New, llvm::AtomicOrdering::SequentiallyConsistent,
321       llvm::AtomicOrdering::SequentiallyConsistent);
322   if (ReturnBool)
323     // Extract boolean success flag and zext it to int.
324     return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
325                                   CGF.ConvertType(E->getType()));
326   else
327     // Extract old value and emit it using the same type as compare value.
328     return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
329                        ValueType);
330 }
331 
332 /// This function should be invoked to emit atomic cmpxchg for Microsoft's
333 /// _InterlockedCompareExchange* intrinsics which have the following signature:
334 /// T _InterlockedCompareExchange(T volatile *Destination,
335 ///                               T Exchange,
336 ///                               T Comparand);
337 ///
338 /// Whereas the llvm 'cmpxchg' instruction has the following syntax:
339 /// cmpxchg *Destination, Comparand, Exchange.
/// So we need to swap Comparand and Exchange when invoking
/// CreateAtomicCmpXchg. That is also why we cannot reuse the utility function
/// MakeAtomicCmpXchgValue above: it expects its arguments to already be in
/// cmpxchg order.
344 
345 static
346 Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
347     AtomicOrdering SuccessOrdering = AtomicOrdering::SequentiallyConsistent) {
348   assert(E->getArg(0)->getType()->isPointerType());
349   assert(CGF.getContext().hasSameUnqualifiedType(
350       E->getType(), E->getArg(0)->getType()->getPointeeType()));
351   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
352                                                  E->getArg(1)->getType()));
353   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
354                                                  E->getArg(2)->getType()));
355 
356   Address DestAddr = CheckAtomicAlignment(CGF, E);
357 
358   auto *Comparand = CGF.EmitScalarExpr(E->getArg(2));
359   auto *Exchange = CGF.EmitScalarExpr(E->getArg(1));
360 
361   // For Release ordering, the failure ordering should be Monotonic.
362   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release ?
363                          AtomicOrdering::Monotonic :
364                          SuccessOrdering;
365 
366   // The atomic instruction is marked volatile for consistency with MSVC. This
367   // blocks the few atomics optimizations that LLVM has. If we want to optimize
368   // _Interlocked* operations in the future, we will have to remove the volatile
369   // marker.
370   auto *Result = CGF.Builder.CreateAtomicCmpXchg(
371       DestAddr, Comparand, Exchange, SuccessOrdering, FailureOrdering);
372   Result->setVolatile(true);
373   return CGF.Builder.CreateExtractValue(Result, 0);
374 }
375 
// 64-bit Microsoft platforms support 128-bit cmpxchg operations. They are
377 // prototyped like this:
378 //
379 // unsigned char _InterlockedCompareExchange128...(
380 //     __int64 volatile * _Destination,
381 //     __int64 _ExchangeHigh,
382 //     __int64 _ExchangeLow,
383 //     __int64 * _ComparandResult);
384 //
// Note that Destination is assumed to be at least 16-byte aligned, despite
// being typed as __int64.
387 
388 static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
389                                               const CallExpr *E,
390                                               AtomicOrdering SuccessOrdering) {
391   assert(E->getNumArgs() == 4);
392   llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
393   llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
394   llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
395   Address ComparandAddr = CGF.EmitPointerWithAlignment(E->getArg(3));
396 
397   assert(DestPtr->getType()->isPointerTy());
398   assert(!ExchangeHigh->getType()->isPointerTy());
399   assert(!ExchangeLow->getType()->isPointerTy());
400 
401   // For Release ordering, the failure ordering should be Monotonic.
402   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
403                              ? AtomicOrdering::Monotonic
404                              : SuccessOrdering;
405 
  // Convert to i128 pointers and values. Alignment is also overridden for the
  // destination pointer.
408   llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
409   Address DestAddr(DestPtr, Int128Ty,
410                    CGF.getContext().toCharUnitsFromBits(128));
411   ComparandAddr = ComparandAddr.withElementType(Int128Ty);
412 
413   // (((i128)hi) << 64) | ((i128)lo)
414   ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
415   ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
416   ExchangeHigh =
417       CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
418   llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
419 
420   // Load the comparand for the instruction.
421   llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandAddr);
422 
423   auto *CXI = CGF.Builder.CreateAtomicCmpXchg(DestAddr, Comparand, Exchange,
424                                               SuccessOrdering, FailureOrdering);
425 
426   // The atomic instruction is marked volatile for consistency with MSVC. This
427   // blocks the few atomics optimizations that LLVM has. If we want to optimize
428   // _Interlocked* operations in the future, we will have to remove the volatile
429   // marker.
430   CXI->setVolatile(true);
431 
432   // Store the result as an outparameter.
433   CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
434                           ComparandAddr);
435 
436   // Get the success boolean and zero extend it to i8.
437   Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
438   return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
439 }
440 
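// Atomically add 1 to the pointed-to value and return the incremented
// (post-operation) result, matching the semantics of builtins such as
// _InterlockedIncrement. EmitAtomicDecrementValue below is the subtracting
// counterpart.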
441 static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
442     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
443   assert(E->getArg(0)->getType()->isPointerType());
444 
445   auto *IntTy = CGF.ConvertType(E->getType());
446   Address DestAddr = CheckAtomicAlignment(CGF, E);
447   auto *Result = CGF.Builder.CreateAtomicRMW(
448       AtomicRMWInst::Add, DestAddr, ConstantInt::get(IntTy, 1), Ordering);
449   return CGF.Builder.CreateAdd(Result, ConstantInt::get(IntTy, 1));
450 }
451 
452 static Value *EmitAtomicDecrementValue(
453     CodeGenFunction &CGF, const CallExpr *E,
454     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
455   assert(E->getArg(0)->getType()->isPointerType());
456 
457   auto *IntTy = CGF.ConvertType(E->getType());
458   Address DestAddr = CheckAtomicAlignment(CGF, E);
459   auto *Result = CGF.Builder.CreateAtomicRMW(
460       AtomicRMWInst::Sub, DestAddr, ConstantInt::get(IntTy, 1), Ordering);
461   return CGF.Builder.CreateSub(Result, ConstantInt::get(IntTy, 1));
462 }
463 
464 // Build a plain volatile load.
465 static Value *EmitISOVolatileLoad(CodeGenFunction &CGF, const CallExpr *E) {
466   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
467   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
468   CharUnits LoadSize = CGF.getContext().getTypeSizeInChars(ElTy);
469   llvm::Type *ITy =
470       llvm::IntegerType::get(CGF.getLLVMContext(), LoadSize.getQuantity() * 8);
471   llvm::LoadInst *Load = CGF.Builder.CreateAlignedLoad(ITy, Ptr, LoadSize);
472   Load->setVolatile(true);
473   return Load;
474 }
475 
476 // Build a plain volatile store.
477 static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) {
478   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
479   Value *Value = CGF.EmitScalarExpr(E->getArg(1));
480   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
481   CharUnits StoreSize = CGF.getContext().getTypeSizeInChars(ElTy);
482   llvm::StoreInst *Store =
483       CGF.Builder.CreateAlignedStore(Value, Ptr, StoreSize);
484   Store->setVolatile(true);
485   return Store;
486 }
487 
488 // Emit a simple mangled intrinsic that has 1 argument and a return type
489 // matching the argument type. Depending on mode, this may be a constrained
490 // floating-point intrinsic.
491 static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
492                                 const CallExpr *E, unsigned IntrinsicID,
493                                 unsigned ConstrainedIntrinsicID) {
494   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
495 
496   CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
497   if (CGF.Builder.getIsFPConstrained()) {
498     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
499     return CGF.Builder.CreateConstrainedFPCall(F, { Src0 });
500   } else {
501     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
502     return CGF.Builder.CreateCall(F, Src0);
503   }
504 }
505 
506 // Emit an intrinsic that has 2 operands of the same type as its result.
507 // Depending on mode, this may be a constrained floating-point intrinsic.
508 static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
509                                 const CallExpr *E, unsigned IntrinsicID,
510                                 unsigned ConstrainedIntrinsicID) {
511   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
512   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
513 
514   if (CGF.Builder.getIsFPConstrained()) {
515     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
516     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
517     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 });
518   } else {
519     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
520     return CGF.Builder.CreateCall(F, { Src0, Src1 });
521   }
522 }
523 
// Emit a binary intrinsic whose second operand type is also part of the
// intrinsic's mangling (e.g. llvm.ldexp.f64.i32). Depending on mode, this may
// be a constrained floating-point intrinsic.
525 static Value *emitBinaryExpMaybeConstrainedFPBuiltin(
526     CodeGenFunction &CGF, const CallExpr *E, llvm::Intrinsic::ID IntrinsicID,
527     llvm::Intrinsic::ID ConstrainedIntrinsicID) {
528   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
529   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
530 
531   if (CGF.Builder.getIsFPConstrained()) {
532     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
533     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
534                                        {Src0->getType(), Src1->getType()});
535     return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
536   }
537 
538   Function *F =
539       CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
540   return CGF.Builder.CreateCall(F, {Src0, Src1});
541 }
542 
543 // Emit an intrinsic that has 3 operands of the same type as its result.
544 // Depending on mode, this may be a constrained floating-point intrinsic.
545 static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
546                                  const CallExpr *E, unsigned IntrinsicID,
547                                  unsigned ConstrainedIntrinsicID) {
548   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
549   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
550   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
551 
552   if (CGF.Builder.getIsFPConstrained()) {
553     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
554     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
555     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 });
556   } else {
557     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
558     return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
559   }
560 }
561 
562 // Emit an intrinsic where all operands are of the same type as the result.
563 // Depending on mode, this may be a constrained floating-point intrinsic.
564 static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
565                                                 unsigned IntrinsicID,
566                                                 unsigned ConstrainedIntrinsicID,
567                                                 llvm::Type *Ty,
568                                                 ArrayRef<Value *> Args) {
569   Function *F;
570   if (CGF.Builder.getIsFPConstrained())
571     F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
572   else
573     F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
574 
575   if (CGF.Builder.getIsFPConstrained())
576     return CGF.Builder.CreateConstrainedFPCall(F, Args);
577   else
578     return CGF.Builder.CreateCall(F, Args);
579 }
580 
581 // Emit a simple mangled intrinsic that has 1 argument and a return type
582 // matching the argument type.
583 static Value *emitUnaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
584                                unsigned IntrinsicID,
585                                llvm::StringRef Name = "") {
586   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
587 
588   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
589   return CGF.Builder.CreateCall(F, Src0, Name);
590 }
591 
592 // Emit an intrinsic that has 2 operands of the same type as its result.
593 static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
594                                 const CallExpr *E,
595                                 unsigned IntrinsicID) {
596   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
597   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
598 
599   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
600   return CGF.Builder.CreateCall(F, { Src0, Src1 });
601 }
602 
603 // Emit an intrinsic that has 3 operands of the same type as its result.
604 static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
605                                  const CallExpr *E,
606                                  unsigned IntrinsicID) {
607   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
608   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
609   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
610 
611   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
612   return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
613 }
614 
615 // Emit an intrinsic that has 1 float or double operand, and 1 integer.
616 static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
617                                const CallExpr *E,
618                                unsigned IntrinsicID) {
619   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
620   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
621 
622   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
623   return CGF.Builder.CreateCall(F, {Src0, Src1});
624 }
625 
626 // Emit an intrinsic that has overloaded integer result and fp operand.
627 static Value *
628 emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E,
629                                         unsigned IntrinsicID,
630                                         unsigned ConstrainedIntrinsicID) {
631   llvm::Type *ResultType = CGF.ConvertType(E->getType());
632   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
633 
634   if (CGF.Builder.getIsFPConstrained()) {
635     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
636     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
637                                        {ResultType, Src0->getType()});
638     return CGF.Builder.CreateConstrainedFPCall(F, {Src0});
639   } else {
640     Function *F =
641         CGF.CGM.getIntrinsic(IntrinsicID, {ResultType, Src0->getType()});
642     return CGF.Builder.CreateCall(F, Src0);
643   }
644 }
645 
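// Emit a frexp-style builtin: the underlying intrinsic returns a
// {fraction, exponent} pair; the exponent is stored through the second
// (pointer) argument and the fraction is returned as the value of the call.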
646 static Value *emitFrexpBuiltin(CodeGenFunction &CGF, const CallExpr *E,
647                                llvm::Intrinsic::ID IntrinsicID) {
648   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
649   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
650 
651   QualType IntPtrTy = E->getArg(1)->getType()->getPointeeType();
652   llvm::Type *IntTy = CGF.ConvertType(IntPtrTy);
653   llvm::Function *F =
654       CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), IntTy});
655   llvm::Value *Call = CGF.Builder.CreateCall(F, Src0);
656 
657   llvm::Value *Exp = CGF.Builder.CreateExtractValue(Call, 1);
658   LValue LV = CGF.MakeNaturalAlignAddrLValue(Src1, IntPtrTy);
659   CGF.EmitStoreOfScalar(Exp, LV);
660 
661   return CGF.Builder.CreateExtractValue(Call, 0);
662 }
663 
664 /// EmitFAbs - Emit a call to @llvm.fabs().
665 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
666   Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
667   llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
668   Call->setDoesNotAccessMemory();
669   return Call;
670 }
671 
672 /// Emit the computation of the sign bit for a floating point value. Returns
673 /// the i1 sign bit value.
674 static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
675   LLVMContext &C = CGF.CGM.getLLVMContext();
676 
677   llvm::Type *Ty = V->getType();
678   int Width = Ty->getPrimitiveSizeInBits();
679   llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
680   V = CGF.Builder.CreateBitCast(V, IntTy);
681   if (Ty->isPPC_FP128Ty()) {
682     // We want the sign bit of the higher-order double. The bitcast we just
683     // did works as if the double-double was stored to memory and then
684     // read as an i128. The "store" will put the higher-order double in the
    // lower address in both little- and big-endian modes, but the "load"
    // will treat those bits as a different part of the i128: the low bits in
    // little-endian, the high bits in big-endian. Therefore, on big-endian
688     // we need to shift the high bits down to the low before truncating.
689     Width >>= 1;
690     if (CGF.getTarget().isBigEndian()) {
691       Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
692       V = CGF.Builder.CreateLShr(V, ShiftCst);
693     }
694     // We are truncating value in order to extract the higher-order
695     // double, which we will be using to extract the sign from.
696     IntTy = llvm::IntegerType::get(C, Width);
697     V = CGF.Builder.CreateTrunc(V, IntTy);
698   }
699   Value *Zero = llvm::Constant::getNullValue(IntTy);
700   return CGF.Builder.CreateICmpSLT(V, Zero);
701 }
702 
703 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
704                               const CallExpr *E, llvm::Constant *calleeValue) {
705   CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
706   return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
707 }
708 
709 /// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
710 /// depending on IntrinsicID.
711 ///
712 /// \arg CGF The current codegen function.
713 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
714 /// \arg X The first argument to the llvm.*.with.overflow.*.
715 /// \arg Y The second argument to the llvm.*.with.overflow.*.
716 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
717 /// \returns The result (i.e. sum/product) returned by the intrinsic.
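///
/// For example, with llvm.uadd.with.overflow.i32 the intrinsic returns an
/// {i32, i1} pair: the i32 sum is the return value and the i1 overflow flag
/// is passed back through \p Carry.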
718 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
719                                           const llvm::Intrinsic::ID IntrinsicID,
720                                           llvm::Value *X, llvm::Value *Y,
721                                           llvm::Value *&Carry) {
722   // Make sure we have integers of the same width.
723   assert(X->getType() == Y->getType() &&
724          "Arguments must be the same type. (Did you forget to make sure both "
725          "arguments have the same integer width?)");
726 
727   Function *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
728   llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
729   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
730   return CGF.Builder.CreateExtractValue(Tmp, 0);
731 }
732 
733 static Value *emitRangedBuiltin(CodeGenFunction &CGF,
734                                 unsigned IntrinsicID,
735                                 int low, int high) {
736     llvm::MDBuilder MDHelper(CGF.getLLVMContext());
737     llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
738     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
739     llvm::Instruction *Call = CGF.Builder.CreateCall(F);
740     Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
741     Call->setMetadata(llvm::LLVMContext::MD_noundef,
742                       llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
743     return Call;
744 }
745 
746 namespace {
747   struct WidthAndSignedness {
748     unsigned Width;
749     bool Signed;
750   };
751 }
752 
753 static WidthAndSignedness
754 getIntegerWidthAndSignedness(const clang::ASTContext &context,
755                              const clang::QualType Type) {
756   assert(Type->isIntegerType() && "Given type is not an integer.");
757   unsigned Width = Type->isBooleanType()  ? 1
758                    : Type->isBitIntType() ? context.getIntWidth(Type)
759                                           : context.getTypeInfo(Type).Width;
760   bool Signed = Type->isSignedIntegerType();
761   return {Width, Signed};
762 }
763 
764 // Given one or more integer types, this function produces an integer type that
765 // encompasses them: any value in one of the given types could be expressed in
766 // the encompassing type.
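// For example, encompassing a signed 16-bit type and an unsigned 32-bit type
// yields a signed 33-bit type: the result must be signed, and one extra bit is
// needed so that every unsigned 32-bit value remains representable.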
767 static struct WidthAndSignedness
768 EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
769   assert(Types.size() > 0 && "Empty list of types.");
770 
771   // If any of the given types is signed, we must return a signed type.
772   bool Signed = false;
773   for (const auto &Type : Types) {
774     Signed |= Type.Signed;
775   }
776 
777   // The encompassing type must have a width greater than or equal to the width
778   // of the specified types.  Additionally, if the encompassing type is signed,
779   // its width must be strictly greater than the width of any unsigned types
780   // given.
781   unsigned Width = 0;
782   for (const auto &Type : Types) {
783     unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
784     if (Width < MinWidth) {
785       Width = MinWidth;
786     }
787   }
788 
789   return {Width, Signed};
790 }
791 
792 Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
793   Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
794   return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
795 }
796 
797 /// Checks if using the result of __builtin_object_size(p, @p From) in place of
798 /// __builtin_object_size(p, @p To) is correct
799 static bool areBOSTypesCompatible(int From, int To) {
800   // Note: Our __builtin_object_size implementation currently treats Type=0 and
801   // Type=2 identically. Encoding this implementation detail here may make
802   // improving __builtin_object_size difficult in the future, so it's omitted.
803   return From == To || (From == 0 && To == 1) || (From == 3 && To == 2);
804 }
805 
806 static llvm::Value *
807 getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
808   return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
809 }
810 
811 llvm::Value *
812 CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
813                                                  llvm::IntegerType *ResType,
814                                                  llvm::Value *EmittedE,
815                                                  bool IsDynamic) {
816   uint64_t ObjectSize;
817   if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
818     return emitBuiltinObjectSize(E, Type, ResType, EmittedE, IsDynamic);
819   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
820 }
821 
822 const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
823     ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) {
824   const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
825       getLangOpts().getStrictFlexArraysLevel();
826   uint32_t FieldNo = 0;
827 
828   if (RD->isImplicit())
829     return nullptr;
830 
831   for (const FieldDecl *FD : RD->fields()) {
832     if ((Name.empty() || FD->getNameAsString() == Name) &&
833         Decl::isFlexibleArrayMemberLike(
834             Ctx, FD, FD->getType(), StrictFlexArraysLevel,
835             /*IgnoreTemplateOrMacroSubstitution=*/true)) {
836       const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
837       Offset += Layout.getFieldOffset(FieldNo);
838       return FD;
839     }
840 
841     QualType Ty = FD->getType();
842     if (Ty->isRecordType()) {
843       if (const FieldDecl *Field = FindFlexibleArrayMemberField(
844               Ctx, Ty->getAsRecordDecl(), Name, Offset)) {
845         const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
846         Offset += Layout.getFieldOffset(FieldNo);
847         return Field;
848       }
849     }
850 
851     if (!RD->isUnion())
852       ++FieldNo;
853   }
854 
855   return nullptr;
856 }
857 
858 static unsigned CountCountedByAttrs(const RecordDecl *RD) {
859   unsigned Num = 0;
860 
861   for (const Decl *D : RD->decls()) {
862     if (const auto *FD = dyn_cast<FieldDecl>(D);
863         FD && FD->hasAttr<CountedByAttr>()) {
864       return ++Num;
865     }
866 
867     if (const auto *Rec = dyn_cast<RecordDecl>(D))
868       Num += CountCountedByAttrs(Rec);
869   }
870 
871   return Num;
872 }
873 
874 llvm::Value *
875 CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
876                                              llvm::IntegerType *ResType) {
877   // The code generated here calculates the size of a struct with a flexible
  // array member that uses the counted_by attribute. There are three cases
  // we handle:
880   //
881   //       struct s {
882   //         unsigned long flags;
883   //         int count;
884   //         int array[] __attribute__((counted_by(count)));
885   //       }
886   //
887   //   1) bdos of the flexible array itself:
888   //
889   //     __builtin_dynamic_object_size(p->array, 1) ==
890   //         p->count * sizeof(*p->array)
891   //
892   //   2) bdos of a pointer into the flexible array:
893   //
894   //     __builtin_dynamic_object_size(&p->array[42], 1) ==
895   //         (p->count - 42) * sizeof(*p->array)
896   //
  //   3) bdos of the whole struct, including the flexible array:
898   //
899   //     __builtin_dynamic_object_size(p, 1) ==
900   //        max(sizeof(struct s),
901   //            offsetof(struct s, array) + p->count * sizeof(*p->array))
902   //
903   ASTContext &Ctx = getContext();
904   const Expr *Base = E->IgnoreParenImpCasts();
905   const Expr *Idx = nullptr;
906 
907   if (const auto *UO = dyn_cast<UnaryOperator>(Base);
908       UO && UO->getOpcode() == UO_AddrOf) {
909     Expr *SubExpr = UO->getSubExpr()->IgnoreParenImpCasts();
910     if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(SubExpr)) {
911       Base = ASE->getBase()->IgnoreParenImpCasts();
912       Idx = ASE->getIdx()->IgnoreParenImpCasts();
913 
914       if (const auto *IL = dyn_cast<IntegerLiteral>(Idx)) {
915         int64_t Val = IL->getValue().getSExtValue();
916         if (Val < 0)
917           return getDefaultBuiltinObjectSizeResult(Type, ResType);
918 
919         if (Val == 0)
920           // The index is 0, so we don't need to take it into account.
921           Idx = nullptr;
922       }
923     } else {
924       // Potential pointer to another element in the struct.
925       Base = SubExpr;
926     }
927   }
928 
929   // Get the flexible array member Decl.
930   const RecordDecl *OuterRD = nullptr;
931   std::string FAMName;
932   if (const auto *ME = dyn_cast<MemberExpr>(Base)) {
933     // Check if \p Base is referencing the FAM itself.
934     const ValueDecl *VD = ME->getMemberDecl();
935     OuterRD = VD->getDeclContext()->getOuterLexicalRecordContext();
936     FAMName = VD->getNameAsString();
937   } else if (const auto *DRE = dyn_cast<DeclRefExpr>(Base)) {
938     // Check if we're pointing to the whole struct.
939     QualType Ty = DRE->getDecl()->getType();
940     if (Ty->isPointerType())
941       Ty = Ty->getPointeeType();
942     OuterRD = Ty->getAsRecordDecl();
943 
944     // If we have a situation like this:
945     //
946     //     struct union_of_fams {
947     //         int flags;
948     //         union {
949     //             signed char normal_field;
950     //             struct {
951     //                 int count1;
952     //                 int arr1[] __counted_by(count1);
953     //             };
954     //             struct {
955     //                 signed char count2;
956     //                 int arr2[] __counted_by(count2);
957     //             };
958     //         };
959     //    };
960     //
    // We don't know which 'count' to use in this scenario:
962     //
963     //     size_t get_size(struct union_of_fams *p) {
964     //         return __builtin_dynamic_object_size(p, 1);
965     //     }
966     //
967     // Instead of calculating a wrong number, we give up.
968     if (OuterRD && CountCountedByAttrs(OuterRD) > 1)
969       return nullptr;
970   }
971 
972   if (!OuterRD)
973     return nullptr;
974 
975   uint64_t Offset = 0;
976   const FieldDecl *FAMDecl =
977       FindFlexibleArrayMemberField(Ctx, OuterRD, FAMName, Offset);
978   Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity();
979 
980   if (!FAMDecl || !FAMDecl->hasAttr<CountedByAttr>())
981     // No flexible array member found or it doesn't have the "counted_by"
982     // attribute.
983     return nullptr;
984 
985   const FieldDecl *CountedByFD = FindCountedByField(FAMDecl);
986   if (!CountedByFD)
987     // Can't find the field referenced by the "counted_by" attribute.
988     return nullptr;
989 
990   // Build a load of the counted_by field.
991   bool IsSigned = CountedByFD->getType()->isSignedIntegerType();
992   Value *CountedByInst = EmitCountedByFieldExpr(Base, FAMDecl, CountedByFD);
993   if (!CountedByInst)
994     return getDefaultBuiltinObjectSizeResult(Type, ResType);
995 
996   CountedByInst = Builder.CreateIntCast(CountedByInst, ResType, IsSigned);
997 
998   // Build a load of the index and subtract it from the count.
999   Value *IdxInst = nullptr;
1000   if (Idx) {
1001     if (Idx->HasSideEffects(getContext()))
1002       // We can't have side-effects.
1003       return getDefaultBuiltinObjectSizeResult(Type, ResType);
1004 
1005     bool IdxSigned = Idx->getType()->isSignedIntegerType();
1006     IdxInst = EmitAnyExprToTemp(Idx).getScalarVal();
1007     IdxInst = Builder.CreateIntCast(IdxInst, ResType, IdxSigned);
1008 
1009     // We go ahead with the calculation here. If the index turns out to be
1010     // negative, we'll catch it at the end.
1011     CountedByInst =
1012         Builder.CreateSub(CountedByInst, IdxInst, "", !IsSigned, IsSigned);
1013   }
1014 
1015   // Calculate how large the flexible array member is in bytes.
1016   const ArrayType *ArrayTy = Ctx.getAsArrayType(FAMDecl->getType());
1017   CharUnits Size = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
1018   llvm::Constant *ElemSize =
1019       llvm::ConstantInt::get(ResType, Size.getQuantity(), IsSigned);
1020   Value *FAMSize =
1021       Builder.CreateMul(CountedByInst, ElemSize, "", !IsSigned, IsSigned);
1022   FAMSize = Builder.CreateIntCast(FAMSize, ResType, IsSigned);
1023   Value *Res = FAMSize;
1024 
1025   if (isa<DeclRefExpr>(Base)) {
    // The whole struct is specified in the __bdos.
1027     const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(OuterRD);
1028 
1029     // Get the offset of the FAM.
1030     llvm::Constant *FAMOffset = ConstantInt::get(ResType, Offset, IsSigned);
1031     Value *OffsetAndFAMSize =
1032         Builder.CreateAdd(FAMOffset, Res, "", !IsSigned, IsSigned);
1033 
1034     // Get the full size of the struct.
1035     llvm::Constant *SizeofStruct =
1036         ConstantInt::get(ResType, Layout.getSize().getQuantity(), IsSigned);
1037 
1038     // max(sizeof(struct s),
1039     //     offsetof(struct s, array) + p->count * sizeof(*p->array))
1040     Res = IsSigned
1041               ? Builder.CreateBinaryIntrinsic(llvm::Intrinsic::smax,
1042                                               OffsetAndFAMSize, SizeofStruct)
1043               : Builder.CreateBinaryIntrinsic(llvm::Intrinsic::umax,
1044                                               OffsetAndFAMSize, SizeofStruct);
1045   }
1046 
1047   // A negative \p IdxInst or \p CountedByInst means that the index lands
1048   // outside of the flexible array member. If that's the case, we want to
1049   // return 0.
1050   Value *Cmp = Builder.CreateIsNotNeg(CountedByInst);
1051   if (IdxInst)
1052     Cmp = Builder.CreateAnd(Builder.CreateIsNotNeg(IdxInst), Cmp);
1053 
1054   return Builder.CreateSelect(Cmp, Res, ConstantInt::get(ResType, 0, IsSigned));
1055 }
1056 
1057 /// Returns a Value corresponding to the size of the given expression.
1058 /// This Value may be either of the following:
1059 ///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
1060 ///     it)
1061 ///   - A call to the @llvm.objectsize intrinsic
1062 ///
1063 /// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
1064 /// and we wouldn't otherwise try to reference a pass_object_size parameter,
1065 /// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
1066 llvm::Value *
1067 CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
1068                                        llvm::IntegerType *ResType,
1069                                        llvm::Value *EmittedE, bool IsDynamic) {
1070   // We need to reference an argument if the pointer is a parameter with the
1071   // pass_object_size attribute.
1072   if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
1073     auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
1074     auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
1075     if (Param != nullptr && PS != nullptr &&
1076         areBOSTypesCompatible(PS->getType(), Type)) {
1077       auto Iter = SizeArguments.find(Param);
1078       assert(Iter != SizeArguments.end());
1079 
1080       const ImplicitParamDecl *D = Iter->second;
1081       auto DIter = LocalDeclMap.find(D);
1082       assert(DIter != LocalDeclMap.end());
1083 
1084       return EmitLoadOfScalar(DIter->second, /*Volatile=*/false,
1085                               getContext().getSizeType(), E->getBeginLoc());
1086     }
1087   }
1088 
1089   if (IsDynamic) {
1090     // Emit special code for a flexible array member with the "counted_by"
1091     // attribute.
1092     if (Value *V = emitFlexibleArrayMemberSize(E, Type, ResType))
1093       return V;
1094   }
1095 
1096   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
1097   // evaluate E for side-effects. In either case, we shouldn't lower to
1098   // @llvm.objectsize.
1099   if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
1100     return getDefaultBuiltinObjectSizeResult(Type, ResType);
1101 
1102   Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
1103   assert(Ptr->getType()->isPointerTy() &&
1104          "Non-pointer passed to __builtin_object_size?");
1105 
1106   Function *F =
1107       CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
1108 
  // LLVM only supports 0 and 2, so make sure we pass that along as a boolean.
1110   Value *Min = Builder.getInt1((Type & 2) != 0);
  // For GCC compatibility, __builtin_object_size treats NULL as having unknown
  // size.
1112   Value *NullIsUnknown = Builder.getTrue();
1113   Value *Dynamic = Builder.getInt1(IsDynamic);
1114   return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown, Dynamic});
1115 }
1116 
1117 namespace {
1118 /// A struct to generically describe a bit test intrinsic.
1119 struct BitTest {
1120   enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
1121   enum InterlockingKind : uint8_t {
1122     Unlocked,
1123     Sequential,
1124     Acquire,
1125     Release,
1126     NoFence
1127   };
1128 
1129   ActionKind Action;
1130   InterlockingKind Interlocking;
1131   bool Is64Bit;
1132 
1133   static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
1134 };
1135 } // namespace
1136 
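// Decode the action, interlocking kind, and operand width encoded in a
// _bittest-family builtin; e.g. _interlockedbittestandset64 decodes to
// {Set, Sequential, /*Is64Bit=*/true}.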
1137 BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
1138   switch (BuiltinID) {
1139     // Main portable variants.
1140   case Builtin::BI_bittest:
1141     return {TestOnly, Unlocked, false};
1142   case Builtin::BI_bittestandcomplement:
1143     return {Complement, Unlocked, false};
1144   case Builtin::BI_bittestandreset:
1145     return {Reset, Unlocked, false};
1146   case Builtin::BI_bittestandset:
1147     return {Set, Unlocked, false};
1148   case Builtin::BI_interlockedbittestandreset:
1149     return {Reset, Sequential, false};
1150   case Builtin::BI_interlockedbittestandset:
1151     return {Set, Sequential, false};
1152 
1153     // X86-specific 64-bit variants.
1154   case Builtin::BI_bittest64:
1155     return {TestOnly, Unlocked, true};
1156   case Builtin::BI_bittestandcomplement64:
1157     return {Complement, Unlocked, true};
1158   case Builtin::BI_bittestandreset64:
1159     return {Reset, Unlocked, true};
1160   case Builtin::BI_bittestandset64:
1161     return {Set, Unlocked, true};
1162   case Builtin::BI_interlockedbittestandreset64:
1163     return {Reset, Sequential, true};
1164   case Builtin::BI_interlockedbittestandset64:
1165     return {Set, Sequential, true};
1166 
1167     // ARM/AArch64-specific ordering variants.
1168   case Builtin::BI_interlockedbittestandset_acq:
1169     return {Set, Acquire, false};
1170   case Builtin::BI_interlockedbittestandset_rel:
1171     return {Set, Release, false};
1172   case Builtin::BI_interlockedbittestandset_nf:
1173     return {Set, NoFence, false};
1174   case Builtin::BI_interlockedbittestandreset_acq:
1175     return {Reset, Acquire, false};
1176   case Builtin::BI_interlockedbittestandreset_rel:
1177     return {Reset, Release, false};
1178   case Builtin::BI_interlockedbittestandreset_nf:
1179     return {Reset, NoFence, false};
1180   }
1181   llvm_unreachable("expected only bittest intrinsics");
1182 }
1183 
1184 static char bitActionToX86BTCode(BitTest::ActionKind A) {
1185   switch (A) {
1186   case BitTest::TestOnly:   return '\0';
1187   case BitTest::Complement: return 'c';
1188   case BitTest::Reset:      return 'r';
1189   case BitTest::Set:        return 's';
1190   }
1191   llvm_unreachable("invalid action");
1192 }
1193 
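// On x86 a bit test builtin is emitted as a single BT/BTC/BTR/BTS inline-asm
// instruction; e.g. _interlockedbittestandset(p, n) becomes
// "lock btsl $2, ($1)" with the tested bit returned via the carry flag
// ("={@ccc}").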
1194 static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
1195                                             BitTest BT,
1196                                             const CallExpr *E, Value *BitBase,
1197                                             Value *BitPos) {
1198   char Action = bitActionToX86BTCode(BT.Action);
1199   char SizeSuffix = BT.Is64Bit ? 'q' : 'l';
1200 
1201   // Build the assembly.
1202   SmallString<64> Asm;
1203   raw_svector_ostream AsmOS(Asm);
1204   if (BT.Interlocking != BitTest::Unlocked)
1205     AsmOS << "lock ";
1206   AsmOS << "bt";
1207   if (Action)
1208     AsmOS << Action;
1209   AsmOS << SizeSuffix << " $2, ($1)";
1210 
1211   // Build the constraints. FIXME: We should support immediates when possible.
1212   std::string Constraints = "={@ccc},r,r,~{cc},~{memory}";
1213   std::string_view MachineClobbers = CGF.getTarget().getClobbers();
1214   if (!MachineClobbers.empty()) {
1215     Constraints += ',';
1216     Constraints += MachineClobbers;
1217   }
1218   llvm::IntegerType *IntType = llvm::IntegerType::get(
1219       CGF.getLLVMContext(),
1220       CGF.getContext().getTypeSize(E->getArg(1)->getType()));
1221   llvm::FunctionType *FTy =
1222       llvm::FunctionType::get(CGF.Int8Ty, {CGF.UnqualPtrTy, IntType}, false);
1223 
1224   llvm::InlineAsm *IA =
1225       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1226   return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
1227 }
1228 
1229 static llvm::AtomicOrdering
1230 getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
1231   switch (I) {
1232   case BitTest::Unlocked:   return llvm::AtomicOrdering::NotAtomic;
1233   case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
1234   case BitTest::Acquire:    return llvm::AtomicOrdering::Acquire;
1235   case BitTest::Release:    return llvm::AtomicOrdering::Release;
1236   case BitTest::NoFence:    return llvm::AtomicOrdering::Monotonic;
1237   }
1238   llvm_unreachable("invalid interlocking");
1239 }
1240 
1241 /// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
1242 /// bits and a bit position and read and optionally modify the bit at that
1243 /// position. The position index can be arbitrarily large, i.e. it can be larger
1244 /// than 31 or 63, so we need an indexed load in the general case.
1245 static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
1246                                          unsigned BuiltinID,
1247                                          const CallExpr *E) {
1248   Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
1249   Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));
1250 
1251   BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);
1252 
1253   // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
1254   // indexing operation internally. Use them if possible.
1255   if (CGF.getTarget().getTriple().isX86())
1256     return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);
1257 
1258   // Otherwise, use generic code to load one byte and test the bit. Use all but
1259   // the bottom three bits as the array index, and the bottom three bits to form
1260   // a mask.
1261   // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
1262   Value *ByteIndex = CGF.Builder.CreateAShr(
1263       BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
1264   Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
1265   Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
1266                                                  ByteIndex, "bittest.byteaddr"),
1267                    CGF.Int8Ty, CharUnits::One());
1268   Value *PosLow =
1269       CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
1270                             llvm::ConstantInt::get(CGF.Int8Ty, 0x7));
1271 
1272   // The updating instructions will need a mask.
1273   Value *Mask = nullptr;
1274   if (BT.Action != BitTest::TestOnly) {
1275     Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
1276                                  "bittest.mask");
1277   }
1278 
1279   // Check the action and ordering of the interlocked intrinsics.
1280   llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);
1281 
1282   Value *OldByte = nullptr;
1283   if (Ordering != llvm::AtomicOrdering::NotAtomic) {
1284     // Emit a combined atomicrmw load/store operation for the interlocked
1285     // intrinsics.
1286     llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
1287     if (BT.Action == BitTest::Reset) {
1288       Mask = CGF.Builder.CreateNot(Mask);
1289       RMWOp = llvm::AtomicRMWInst::And;
1290     }
1291     OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr, Mask, Ordering);
1292   } else {
1293     // Emit a plain load for the non-interlocked intrinsics.
1294     OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
1295     Value *NewByte = nullptr;
1296     switch (BT.Action) {
1297     case BitTest::TestOnly:
1298       // Don't store anything.
1299       break;
1300     case BitTest::Complement:
1301       NewByte = CGF.Builder.CreateXor(OldByte, Mask);
1302       break;
1303     case BitTest::Reset:
1304       NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
1305       break;
1306     case BitTest::Set:
1307       NewByte = CGF.Builder.CreateOr(OldByte, Mask);
1308       break;
1309     }
1310     if (NewByte)
1311       CGF.Builder.CreateStore(NewByte, ByteAddr);
1312   }
1313 
1314   // However we loaded the old byte, either by plain load or atomicrmw, shift
1315   // the bit into the low position and mask it to 0 or 1.
1316   Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
1317   return CGF.Builder.CreateAnd(
1318       ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
1319 }
1320 
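/// Emit inline assembly for the PowerPC load-and-reserve builtins
/// (__builtin_ppc_lbarx/lharx/lwarx/ldarx). The result width is selected from
/// the builtin ID, and the pointer operand is annotated with an ElementType
/// attribute describing the accessed memory type.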
1321 static llvm::Value *emitPPCLoadReserveIntrinsic(CodeGenFunction &CGF,
1322                                                 unsigned BuiltinID,
1323                                                 const CallExpr *E) {
1324   Value *Addr = CGF.EmitScalarExpr(E->getArg(0));
1325 
1326   SmallString<64> Asm;
1327   raw_svector_ostream AsmOS(Asm);
1328   llvm::IntegerType *RetType = CGF.Int32Ty;
1329 
1330   switch (BuiltinID) {
1331   case clang::PPC::BI__builtin_ppc_ldarx:
1332     AsmOS << "ldarx ";
1333     RetType = CGF.Int64Ty;
1334     break;
1335   case clang::PPC::BI__builtin_ppc_lwarx:
1336     AsmOS << "lwarx ";
1337     RetType = CGF.Int32Ty;
1338     break;
1339   case clang::PPC::BI__builtin_ppc_lharx:
1340     AsmOS << "lharx ";
1341     RetType = CGF.Int16Ty;
1342     break;
1343   case clang::PPC::BI__builtin_ppc_lbarx:
1344     AsmOS << "lbarx ";
1345     RetType = CGF.Int8Ty;
1346     break;
1347   default:
1348     llvm_unreachable("Expected only PowerPC load reserve intrinsics");
1349   }
1350 
1351   AsmOS << "$0, ${1:y}";
1352 
1353   std::string Constraints = "=r,*Z,~{memory}";
1354   std::string_view MachineClobbers = CGF.getTarget().getClobbers();
1355   if (!MachineClobbers.empty()) {
1356     Constraints += ',';
1357     Constraints += MachineClobbers;
1358   }
1359 
1360   llvm::Type *PtrType = CGF.UnqualPtrTy;
1361   llvm::FunctionType *FTy = llvm::FunctionType::get(RetType, {PtrType}, false);
1362 
1363   llvm::InlineAsm *IA =
1364       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1365   llvm::CallInst *CI = CGF.Builder.CreateCall(IA, {Addr});
1366   CI->addParamAttr(
1367       0, Attribute::get(CGF.getLLVMContext(), Attribute::ElementType, RetType));
1368   return CI;
1369 }
1370 
1371 namespace {
1372 enum class MSVCSetJmpKind {
1373   _setjmpex,
1374   _setjmp3,
1375   _setjmp
1376 };
1377 }
1378 
1379 /// MSVC handles setjmp a bit differently on different platforms. On every
1380 /// architecture except 32-bit x86, the frame address is passed. On x86, extra
1381 /// parameters can be passed as variadic arguments, but we always pass none.
1382 static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
1383                                const CallExpr *E) {
1384   llvm::Value *Arg1 = nullptr;
1385   llvm::Type *Arg1Ty = nullptr;
1386   StringRef Name;
1387   bool IsVarArg = false;
1388   if (SJKind == MSVCSetJmpKind::_setjmp3) {
1389     Name = "_setjmp3";
1390     Arg1Ty = CGF.Int32Ty;
1391     Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
1392     IsVarArg = true;
1393   } else {
1394     Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
1395     Arg1Ty = CGF.Int8PtrTy;
1396     if (CGF.getTarget().getTriple().getArch() == llvm::Triple::aarch64) {
1397       Arg1 = CGF.Builder.CreateCall(
1398           CGF.CGM.getIntrinsic(Intrinsic::sponentry, CGF.AllocaInt8PtrTy));
1399     } else
1400       Arg1 = CGF.Builder.CreateCall(
1401           CGF.CGM.getIntrinsic(Intrinsic::frameaddress, CGF.AllocaInt8PtrTy),
1402           llvm::ConstantInt::get(CGF.Int32Ty, 0));
1403   }
1404 
1405   // Mark the call site and declaration with ReturnsTwice.
1406   llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
1407   llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
1408       CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
1409       llvm::Attribute::ReturnsTwice);
1410   llvm::FunctionCallee SetJmpFn = CGF.CGM.CreateRuntimeFunction(
1411       llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
1412       ReturnsTwiceAttr, /*Local=*/true);
1413 
1414   llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
1415       CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
1416   llvm::Value *Args[] = {Buf, Arg1};
1417   llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
1418   CB->setAttributes(ReturnsTwiceAttr);
1419   return RValue::get(CB);
1420 }
1421 
1422 // Many of the MSVC builtins are available on x64, ARM and AArch64; to avoid
1423 // repeating code, we handle them here.
1424 enum class CodeGenFunction::MSVCIntrin {
1425   _BitScanForward,
1426   _BitScanReverse,
1427   _InterlockedAnd,
1428   _InterlockedDecrement,
1429   _InterlockedExchange,
1430   _InterlockedExchangeAdd,
1431   _InterlockedExchangeSub,
1432   _InterlockedIncrement,
1433   _InterlockedOr,
1434   _InterlockedXor,
1435   _InterlockedExchangeAdd_acq,
1436   _InterlockedExchangeAdd_rel,
1437   _InterlockedExchangeAdd_nf,
1438   _InterlockedExchange_acq,
1439   _InterlockedExchange_rel,
1440   _InterlockedExchange_nf,
1441   _InterlockedCompareExchange_acq,
1442   _InterlockedCompareExchange_rel,
1443   _InterlockedCompareExchange_nf,
1444   _InterlockedCompareExchange128,
1445   _InterlockedCompareExchange128_acq,
1446   _InterlockedCompareExchange128_rel,
1447   _InterlockedCompareExchange128_nf,
1448   _InterlockedOr_acq,
1449   _InterlockedOr_rel,
1450   _InterlockedOr_nf,
1451   _InterlockedXor_acq,
1452   _InterlockedXor_rel,
1453   _InterlockedXor_nf,
1454   _InterlockedAnd_acq,
1455   _InterlockedAnd_rel,
1456   _InterlockedAnd_nf,
1457   _InterlockedIncrement_acq,
1458   _InterlockedIncrement_rel,
1459   _InterlockedIncrement_nf,
1460   _InterlockedDecrement_acq,
1461   _InterlockedDecrement_rel,
1462   _InterlockedDecrement_nf,
1463   __fastfail,
1464 };
1465 
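// Map an ARM-prefixed MSVC-compatibility builtin ID onto the common MSVCIntrin
// enumeration handled by EmitMSVCBuiltinExpr, or return std::nullopt if the
// builtin is not one of the shared intrinsics.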
1466 static std::optional<CodeGenFunction::MSVCIntrin>
1467 translateArmToMsvcIntrin(unsigned BuiltinID) {
1468   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1469   switch (BuiltinID) {
1470   default:
1471     return std::nullopt;
1472   case clang::ARM::BI_BitScanForward:
1473   case clang::ARM::BI_BitScanForward64:
1474     return MSVCIntrin::_BitScanForward;
1475   case clang::ARM::BI_BitScanReverse:
1476   case clang::ARM::BI_BitScanReverse64:
1477     return MSVCIntrin::_BitScanReverse;
1478   case clang::ARM::BI_InterlockedAnd64:
1479     return MSVCIntrin::_InterlockedAnd;
1480   case clang::ARM::BI_InterlockedExchange64:
1481     return MSVCIntrin::_InterlockedExchange;
1482   case clang::ARM::BI_InterlockedExchangeAdd64:
1483     return MSVCIntrin::_InterlockedExchangeAdd;
1484   case clang::ARM::BI_InterlockedExchangeSub64:
1485     return MSVCIntrin::_InterlockedExchangeSub;
1486   case clang::ARM::BI_InterlockedOr64:
1487     return MSVCIntrin::_InterlockedOr;
1488   case clang::ARM::BI_InterlockedXor64:
1489     return MSVCIntrin::_InterlockedXor;
1490   case clang::ARM::BI_InterlockedDecrement64:
1491     return MSVCIntrin::_InterlockedDecrement;
1492   case clang::ARM::BI_InterlockedIncrement64:
1493     return MSVCIntrin::_InterlockedIncrement;
1494   case clang::ARM::BI_InterlockedExchangeAdd8_acq:
1495   case clang::ARM::BI_InterlockedExchangeAdd16_acq:
1496   case clang::ARM::BI_InterlockedExchangeAdd_acq:
1497   case clang::ARM::BI_InterlockedExchangeAdd64_acq:
1498     return MSVCIntrin::_InterlockedExchangeAdd_acq;
1499   case clang::ARM::BI_InterlockedExchangeAdd8_rel:
1500   case clang::ARM::BI_InterlockedExchangeAdd16_rel:
1501   case clang::ARM::BI_InterlockedExchangeAdd_rel:
1502   case clang::ARM::BI_InterlockedExchangeAdd64_rel:
1503     return MSVCIntrin::_InterlockedExchangeAdd_rel;
1504   case clang::ARM::BI_InterlockedExchangeAdd8_nf:
1505   case clang::ARM::BI_InterlockedExchangeAdd16_nf:
1506   case clang::ARM::BI_InterlockedExchangeAdd_nf:
1507   case clang::ARM::BI_InterlockedExchangeAdd64_nf:
1508     return MSVCIntrin::_InterlockedExchangeAdd_nf;
1509   case clang::ARM::BI_InterlockedExchange8_acq:
1510   case clang::ARM::BI_InterlockedExchange16_acq:
1511   case clang::ARM::BI_InterlockedExchange_acq:
1512   case clang::ARM::BI_InterlockedExchange64_acq:
1513     return MSVCIntrin::_InterlockedExchange_acq;
1514   case clang::ARM::BI_InterlockedExchange8_rel:
1515   case clang::ARM::BI_InterlockedExchange16_rel:
1516   case clang::ARM::BI_InterlockedExchange_rel:
1517   case clang::ARM::BI_InterlockedExchange64_rel:
1518     return MSVCIntrin::_InterlockedExchange_rel;
1519   case clang::ARM::BI_InterlockedExchange8_nf:
1520   case clang::ARM::BI_InterlockedExchange16_nf:
1521   case clang::ARM::BI_InterlockedExchange_nf:
1522   case clang::ARM::BI_InterlockedExchange64_nf:
1523     return MSVCIntrin::_InterlockedExchange_nf;
1524   case clang::ARM::BI_InterlockedCompareExchange8_acq:
1525   case clang::ARM::BI_InterlockedCompareExchange16_acq:
1526   case clang::ARM::BI_InterlockedCompareExchange_acq:
1527   case clang::ARM::BI_InterlockedCompareExchange64_acq:
1528     return MSVCIntrin::_InterlockedCompareExchange_acq;
1529   case clang::ARM::BI_InterlockedCompareExchange8_rel:
1530   case clang::ARM::BI_InterlockedCompareExchange16_rel:
1531   case clang::ARM::BI_InterlockedCompareExchange_rel:
1532   case clang::ARM::BI_InterlockedCompareExchange64_rel:
1533     return MSVCIntrin::_InterlockedCompareExchange_rel;
1534   case clang::ARM::BI_InterlockedCompareExchange8_nf:
1535   case clang::ARM::BI_InterlockedCompareExchange16_nf:
1536   case clang::ARM::BI_InterlockedCompareExchange_nf:
1537   case clang::ARM::BI_InterlockedCompareExchange64_nf:
1538     return MSVCIntrin::_InterlockedCompareExchange_nf;
1539   case clang::ARM::BI_InterlockedOr8_acq:
1540   case clang::ARM::BI_InterlockedOr16_acq:
1541   case clang::ARM::BI_InterlockedOr_acq:
1542   case clang::ARM::BI_InterlockedOr64_acq:
1543     return MSVCIntrin::_InterlockedOr_acq;
1544   case clang::ARM::BI_InterlockedOr8_rel:
1545   case clang::ARM::BI_InterlockedOr16_rel:
1546   case clang::ARM::BI_InterlockedOr_rel:
1547   case clang::ARM::BI_InterlockedOr64_rel:
1548     return MSVCIntrin::_InterlockedOr_rel;
1549   case clang::ARM::BI_InterlockedOr8_nf:
1550   case clang::ARM::BI_InterlockedOr16_nf:
1551   case clang::ARM::BI_InterlockedOr_nf:
1552   case clang::ARM::BI_InterlockedOr64_nf:
1553     return MSVCIntrin::_InterlockedOr_nf;
1554   case clang::ARM::BI_InterlockedXor8_acq:
1555   case clang::ARM::BI_InterlockedXor16_acq:
1556   case clang::ARM::BI_InterlockedXor_acq:
1557   case clang::ARM::BI_InterlockedXor64_acq:
1558     return MSVCIntrin::_InterlockedXor_acq;
1559   case clang::ARM::BI_InterlockedXor8_rel:
1560   case clang::ARM::BI_InterlockedXor16_rel:
1561   case clang::ARM::BI_InterlockedXor_rel:
1562   case clang::ARM::BI_InterlockedXor64_rel:
1563     return MSVCIntrin::_InterlockedXor_rel;
1564   case clang::ARM::BI_InterlockedXor8_nf:
1565   case clang::ARM::BI_InterlockedXor16_nf:
1566   case clang::ARM::BI_InterlockedXor_nf:
1567   case clang::ARM::BI_InterlockedXor64_nf:
1568     return MSVCIntrin::_InterlockedXor_nf;
1569   case clang::ARM::BI_InterlockedAnd8_acq:
1570   case clang::ARM::BI_InterlockedAnd16_acq:
1571   case clang::ARM::BI_InterlockedAnd_acq:
1572   case clang::ARM::BI_InterlockedAnd64_acq:
1573     return MSVCIntrin::_InterlockedAnd_acq;
1574   case clang::ARM::BI_InterlockedAnd8_rel:
1575   case clang::ARM::BI_InterlockedAnd16_rel:
1576   case clang::ARM::BI_InterlockedAnd_rel:
1577   case clang::ARM::BI_InterlockedAnd64_rel:
1578     return MSVCIntrin::_InterlockedAnd_rel;
1579   case clang::ARM::BI_InterlockedAnd8_nf:
1580   case clang::ARM::BI_InterlockedAnd16_nf:
1581   case clang::ARM::BI_InterlockedAnd_nf:
1582   case clang::ARM::BI_InterlockedAnd64_nf:
1583     return MSVCIntrin::_InterlockedAnd_nf;
1584   case clang::ARM::BI_InterlockedIncrement16_acq:
1585   case clang::ARM::BI_InterlockedIncrement_acq:
1586   case clang::ARM::BI_InterlockedIncrement64_acq:
1587     return MSVCIntrin::_InterlockedIncrement_acq;
1588   case clang::ARM::BI_InterlockedIncrement16_rel:
1589   case clang::ARM::BI_InterlockedIncrement_rel:
1590   case clang::ARM::BI_InterlockedIncrement64_rel:
1591     return MSVCIntrin::_InterlockedIncrement_rel;
1592   case clang::ARM::BI_InterlockedIncrement16_nf:
1593   case clang::ARM::BI_InterlockedIncrement_nf:
1594   case clang::ARM::BI_InterlockedIncrement64_nf:
1595     return MSVCIntrin::_InterlockedIncrement_nf;
1596   case clang::ARM::BI_InterlockedDecrement16_acq:
1597   case clang::ARM::BI_InterlockedDecrement_acq:
1598   case clang::ARM::BI_InterlockedDecrement64_acq:
1599     return MSVCIntrin::_InterlockedDecrement_acq;
1600   case clang::ARM::BI_InterlockedDecrement16_rel:
1601   case clang::ARM::BI_InterlockedDecrement_rel:
1602   case clang::ARM::BI_InterlockedDecrement64_rel:
1603     return MSVCIntrin::_InterlockedDecrement_rel;
1604   case clang::ARM::BI_InterlockedDecrement16_nf:
1605   case clang::ARM::BI_InterlockedDecrement_nf:
1606   case clang::ARM::BI_InterlockedDecrement64_nf:
1607     return MSVCIntrin::_InterlockedDecrement_nf;
1608   }
1609   llvm_unreachable("must return from switch");
1610 }
1611 
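// Same mapping as above, for the AArch64-prefixed builtin IDs; AArch64
// additionally routes the 128-bit compare-exchange variants through here.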
1612 static std::optional<CodeGenFunction::MSVCIntrin>
1613 translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
1614   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1615   switch (BuiltinID) {
1616   default:
1617     return std::nullopt;
1618   case clang::AArch64::BI_BitScanForward:
1619   case clang::AArch64::BI_BitScanForward64:
1620     return MSVCIntrin::_BitScanForward;
1621   case clang::AArch64::BI_BitScanReverse:
1622   case clang::AArch64::BI_BitScanReverse64:
1623     return MSVCIntrin::_BitScanReverse;
1624   case clang::AArch64::BI_InterlockedAnd64:
1625     return MSVCIntrin::_InterlockedAnd;
1626   case clang::AArch64::BI_InterlockedExchange64:
1627     return MSVCIntrin::_InterlockedExchange;
1628   case clang::AArch64::BI_InterlockedExchangeAdd64:
1629     return MSVCIntrin::_InterlockedExchangeAdd;
1630   case clang::AArch64::BI_InterlockedExchangeSub64:
1631     return MSVCIntrin::_InterlockedExchangeSub;
1632   case clang::AArch64::BI_InterlockedOr64:
1633     return MSVCIntrin::_InterlockedOr;
1634   case clang::AArch64::BI_InterlockedXor64:
1635     return MSVCIntrin::_InterlockedXor;
1636   case clang::AArch64::BI_InterlockedDecrement64:
1637     return MSVCIntrin::_InterlockedDecrement;
1638   case clang::AArch64::BI_InterlockedIncrement64:
1639     return MSVCIntrin::_InterlockedIncrement;
1640   case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
1641   case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
1642   case clang::AArch64::BI_InterlockedExchangeAdd_acq:
1643   case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
1644     return MSVCIntrin::_InterlockedExchangeAdd_acq;
1645   case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
1646   case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
1647   case clang::AArch64::BI_InterlockedExchangeAdd_rel:
1648   case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
1649     return MSVCIntrin::_InterlockedExchangeAdd_rel;
1650   case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
1651   case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
1652   case clang::AArch64::BI_InterlockedExchangeAdd_nf:
1653   case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
1654     return MSVCIntrin::_InterlockedExchangeAdd_nf;
1655   case clang::AArch64::BI_InterlockedExchange8_acq:
1656   case clang::AArch64::BI_InterlockedExchange16_acq:
1657   case clang::AArch64::BI_InterlockedExchange_acq:
1658   case clang::AArch64::BI_InterlockedExchange64_acq:
1659     return MSVCIntrin::_InterlockedExchange_acq;
1660   case clang::AArch64::BI_InterlockedExchange8_rel:
1661   case clang::AArch64::BI_InterlockedExchange16_rel:
1662   case clang::AArch64::BI_InterlockedExchange_rel:
1663   case clang::AArch64::BI_InterlockedExchange64_rel:
1664     return MSVCIntrin::_InterlockedExchange_rel;
1665   case clang::AArch64::BI_InterlockedExchange8_nf:
1666   case clang::AArch64::BI_InterlockedExchange16_nf:
1667   case clang::AArch64::BI_InterlockedExchange_nf:
1668   case clang::AArch64::BI_InterlockedExchange64_nf:
1669     return MSVCIntrin::_InterlockedExchange_nf;
1670   case clang::AArch64::BI_InterlockedCompareExchange8_acq:
1671   case clang::AArch64::BI_InterlockedCompareExchange16_acq:
1672   case clang::AArch64::BI_InterlockedCompareExchange_acq:
1673   case clang::AArch64::BI_InterlockedCompareExchange64_acq:
1674     return MSVCIntrin::_InterlockedCompareExchange_acq;
1675   case clang::AArch64::BI_InterlockedCompareExchange8_rel:
1676   case clang::AArch64::BI_InterlockedCompareExchange16_rel:
1677   case clang::AArch64::BI_InterlockedCompareExchange_rel:
1678   case clang::AArch64::BI_InterlockedCompareExchange64_rel:
1679     return MSVCIntrin::_InterlockedCompareExchange_rel;
1680   case clang::AArch64::BI_InterlockedCompareExchange8_nf:
1681   case clang::AArch64::BI_InterlockedCompareExchange16_nf:
1682   case clang::AArch64::BI_InterlockedCompareExchange_nf:
1683   case clang::AArch64::BI_InterlockedCompareExchange64_nf:
1684     return MSVCIntrin::_InterlockedCompareExchange_nf;
1685   case clang::AArch64::BI_InterlockedCompareExchange128:
1686     return MSVCIntrin::_InterlockedCompareExchange128;
1687   case clang::AArch64::BI_InterlockedCompareExchange128_acq:
1688     return MSVCIntrin::_InterlockedCompareExchange128_acq;
1689   case clang::AArch64::BI_InterlockedCompareExchange128_nf:
1690     return MSVCIntrin::_InterlockedCompareExchange128_nf;
1691   case clang::AArch64::BI_InterlockedCompareExchange128_rel:
1692     return MSVCIntrin::_InterlockedCompareExchange128_rel;
1693   case clang::AArch64::BI_InterlockedOr8_acq:
1694   case clang::AArch64::BI_InterlockedOr16_acq:
1695   case clang::AArch64::BI_InterlockedOr_acq:
1696   case clang::AArch64::BI_InterlockedOr64_acq:
1697     return MSVCIntrin::_InterlockedOr_acq;
1698   case clang::AArch64::BI_InterlockedOr8_rel:
1699   case clang::AArch64::BI_InterlockedOr16_rel:
1700   case clang::AArch64::BI_InterlockedOr_rel:
1701   case clang::AArch64::BI_InterlockedOr64_rel:
1702     return MSVCIntrin::_InterlockedOr_rel;
1703   case clang::AArch64::BI_InterlockedOr8_nf:
1704   case clang::AArch64::BI_InterlockedOr16_nf:
1705   case clang::AArch64::BI_InterlockedOr_nf:
1706   case clang::AArch64::BI_InterlockedOr64_nf:
1707     return MSVCIntrin::_InterlockedOr_nf;
1708   case clang::AArch64::BI_InterlockedXor8_acq:
1709   case clang::AArch64::BI_InterlockedXor16_acq:
1710   case clang::AArch64::BI_InterlockedXor_acq:
1711   case clang::AArch64::BI_InterlockedXor64_acq:
1712     return MSVCIntrin::_InterlockedXor_acq;
1713   case clang::AArch64::BI_InterlockedXor8_rel:
1714   case clang::AArch64::BI_InterlockedXor16_rel:
1715   case clang::AArch64::BI_InterlockedXor_rel:
1716   case clang::AArch64::BI_InterlockedXor64_rel:
1717     return MSVCIntrin::_InterlockedXor_rel;
1718   case clang::AArch64::BI_InterlockedXor8_nf:
1719   case clang::AArch64::BI_InterlockedXor16_nf:
1720   case clang::AArch64::BI_InterlockedXor_nf:
1721   case clang::AArch64::BI_InterlockedXor64_nf:
1722     return MSVCIntrin::_InterlockedXor_nf;
1723   case clang::AArch64::BI_InterlockedAnd8_acq:
1724   case clang::AArch64::BI_InterlockedAnd16_acq:
1725   case clang::AArch64::BI_InterlockedAnd_acq:
1726   case clang::AArch64::BI_InterlockedAnd64_acq:
1727     return MSVCIntrin::_InterlockedAnd_acq;
1728   case clang::AArch64::BI_InterlockedAnd8_rel:
1729   case clang::AArch64::BI_InterlockedAnd16_rel:
1730   case clang::AArch64::BI_InterlockedAnd_rel:
1731   case clang::AArch64::BI_InterlockedAnd64_rel:
1732     return MSVCIntrin::_InterlockedAnd_rel;
1733   case clang::AArch64::BI_InterlockedAnd8_nf:
1734   case clang::AArch64::BI_InterlockedAnd16_nf:
1735   case clang::AArch64::BI_InterlockedAnd_nf:
1736   case clang::AArch64::BI_InterlockedAnd64_nf:
1737     return MSVCIntrin::_InterlockedAnd_nf;
1738   case clang::AArch64::BI_InterlockedIncrement16_acq:
1739   case clang::AArch64::BI_InterlockedIncrement_acq:
1740   case clang::AArch64::BI_InterlockedIncrement64_acq:
1741     return MSVCIntrin::_InterlockedIncrement_acq;
1742   case clang::AArch64::BI_InterlockedIncrement16_rel:
1743   case clang::AArch64::BI_InterlockedIncrement_rel:
1744   case clang::AArch64::BI_InterlockedIncrement64_rel:
1745     return MSVCIntrin::_InterlockedIncrement_rel;
1746   case clang::AArch64::BI_InterlockedIncrement16_nf:
1747   case clang::AArch64::BI_InterlockedIncrement_nf:
1748   case clang::AArch64::BI_InterlockedIncrement64_nf:
1749     return MSVCIntrin::_InterlockedIncrement_nf;
1750   case clang::AArch64::BI_InterlockedDecrement16_acq:
1751   case clang::AArch64::BI_InterlockedDecrement_acq:
1752   case clang::AArch64::BI_InterlockedDecrement64_acq:
1753     return MSVCIntrin::_InterlockedDecrement_acq;
1754   case clang::AArch64::BI_InterlockedDecrement16_rel:
1755   case clang::AArch64::BI_InterlockedDecrement_rel:
1756   case clang::AArch64::BI_InterlockedDecrement64_rel:
1757     return MSVCIntrin::_InterlockedDecrement_rel;
1758   case clang::AArch64::BI_InterlockedDecrement16_nf:
1759   case clang::AArch64::BI_InterlockedDecrement_nf:
1760   case clang::AArch64::BI_InterlockedDecrement64_nf:
1761     return MSVCIntrin::_InterlockedDecrement_nf;
1762   }
1763   llvm_unreachable("must return from switch");
1764 }
1765 
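// Same mapping, for the X86-prefixed builtin IDs. Only the bit-scan builtins,
// the 64-bit interlocked operations, and _InterlockedCompareExchange128 are
// routed through the common MSVC path here.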
1766 static std::optional<CodeGenFunction::MSVCIntrin>
1767 translateX86ToMsvcIntrin(unsigned BuiltinID) {
1768   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1769   switch (BuiltinID) {
1770   default:
1771     return std::nullopt;
1772   case clang::X86::BI_BitScanForward:
1773   case clang::X86::BI_BitScanForward64:
1774     return MSVCIntrin::_BitScanForward;
1775   case clang::X86::BI_BitScanReverse:
1776   case clang::X86::BI_BitScanReverse64:
1777     return MSVCIntrin::_BitScanReverse;
1778   case clang::X86::BI_InterlockedAnd64:
1779     return MSVCIntrin::_InterlockedAnd;
1780   case clang::X86::BI_InterlockedCompareExchange128:
1781     return MSVCIntrin::_InterlockedCompareExchange128;
1782   case clang::X86::BI_InterlockedExchange64:
1783     return MSVCIntrin::_InterlockedExchange;
1784   case clang::X86::BI_InterlockedExchangeAdd64:
1785     return MSVCIntrin::_InterlockedExchangeAdd;
1786   case clang::X86::BI_InterlockedExchangeSub64:
1787     return MSVCIntrin::_InterlockedExchangeSub;
1788   case clang::X86::BI_InterlockedOr64:
1789     return MSVCIntrin::_InterlockedOr;
1790   case clang::X86::BI_InterlockedXor64:
1791     return MSVCIntrin::_InterlockedXor;
1792   case clang::X86::BI_InterlockedDecrement64:
1793     return MSVCIntrin::_InterlockedDecrement;
1794   case clang::X86::BI_InterlockedIncrement64:
1795     return MSVCIntrin::_InterlockedIncrement;
1796   }
1797   llvm_unreachable("must return from switch");
1798 }
1799 
1800 // Emit an MSVC intrinsic. Assumes that arguments have *not* been evaluated.
1801 Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
1802                                             const CallExpr *E) {
1803   switch (BuiltinID) {
1804   case MSVCIntrin::_BitScanForward:
1805   case MSVCIntrin::_BitScanReverse: {
1806     Address IndexAddress(EmitPointerWithAlignment(E->getArg(0)));
1807     Value *ArgValue = EmitScalarExpr(E->getArg(1));
1808 
1809     llvm::Type *ArgType = ArgValue->getType();
1810     llvm::Type *IndexType = IndexAddress.getElementType();
1811     llvm::Type *ResultType = ConvertType(E->getType());
1812 
1813     Value *ArgZero = llvm::Constant::getNullValue(ArgType);
1814     Value *ResZero = llvm::Constant::getNullValue(ResultType);
1815     Value *ResOne = llvm::ConstantInt::get(ResultType, 1);
1816 
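    // When the input is zero, _BitScanForward/_BitScanReverse return 0 and
    // leave the index unchanged; otherwise they return 1 and store the
    // position of the lowest/highest set bit. Build that control flow
    // explicitly around cttz/ctlz.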
1817     BasicBlock *Begin = Builder.GetInsertBlock();
1818     BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
1819     Builder.SetInsertPoint(End);
1820     PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");
1821 
1822     Builder.SetInsertPoint(Begin);
1823     Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
1824     BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
1825     Builder.CreateCondBr(IsZero, End, NotZero);
1826     Result->addIncoming(ResZero, Begin);
1827 
1828     Builder.SetInsertPoint(NotZero);
1829 
1830     if (BuiltinID == MSVCIntrin::_BitScanForward) {
1831       Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1832       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1833       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1834       Builder.CreateStore(ZeroCount, IndexAddress, false);
1835     } else {
1836       unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
1837       Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);
1838 
1839       Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1840       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1841       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1842       Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
1843       Builder.CreateStore(Index, IndexAddress, false);
1844     }
1845     Builder.CreateBr(End);
1846     Result->addIncoming(ResOne, NotZero);
1847 
1848     Builder.SetInsertPoint(End);
1849     return Result;
1850   }
1851   case MSVCIntrin::_InterlockedAnd:
1852     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
1853   case MSVCIntrin::_InterlockedExchange:
1854     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
1855   case MSVCIntrin::_InterlockedExchangeAdd:
1856     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
1857   case MSVCIntrin::_InterlockedExchangeSub:
1858     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
1859   case MSVCIntrin::_InterlockedOr:
1860     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
1861   case MSVCIntrin::_InterlockedXor:
1862     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
1863   case MSVCIntrin::_InterlockedExchangeAdd_acq:
1864     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1865                                  AtomicOrdering::Acquire);
1866   case MSVCIntrin::_InterlockedExchangeAdd_rel:
1867     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1868                                  AtomicOrdering::Release);
1869   case MSVCIntrin::_InterlockedExchangeAdd_nf:
1870     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1871                                  AtomicOrdering::Monotonic);
1872   case MSVCIntrin::_InterlockedExchange_acq:
1873     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1874                                  AtomicOrdering::Acquire);
1875   case MSVCIntrin::_InterlockedExchange_rel:
1876     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1877                                  AtomicOrdering::Release);
1878   case MSVCIntrin::_InterlockedExchange_nf:
1879     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1880                                  AtomicOrdering::Monotonic);
1881   case MSVCIntrin::_InterlockedCompareExchange_acq:
1882     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Acquire);
1883   case MSVCIntrin::_InterlockedCompareExchange_rel:
1884     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
1885   case MSVCIntrin::_InterlockedCompareExchange_nf:
1886     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1887   case MSVCIntrin::_InterlockedCompareExchange128:
1888     return EmitAtomicCmpXchg128ForMSIntrin(
1889         *this, E, AtomicOrdering::SequentiallyConsistent);
1890   case MSVCIntrin::_InterlockedCompareExchange128_acq:
1891     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
1892   case MSVCIntrin::_InterlockedCompareExchange128_rel:
1893     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
1894   case MSVCIntrin::_InterlockedCompareExchange128_nf:
1895     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1896   case MSVCIntrin::_InterlockedOr_acq:
1897     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1898                                  AtomicOrdering::Acquire);
1899   case MSVCIntrin::_InterlockedOr_rel:
1900     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1901                                  AtomicOrdering::Release);
1902   case MSVCIntrin::_InterlockedOr_nf:
1903     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1904                                  AtomicOrdering::Monotonic);
1905   case MSVCIntrin::_InterlockedXor_acq:
1906     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1907                                  AtomicOrdering::Acquire);
1908   case MSVCIntrin::_InterlockedXor_rel:
1909     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1910                                  AtomicOrdering::Release);
1911   case MSVCIntrin::_InterlockedXor_nf:
1912     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1913                                  AtomicOrdering::Monotonic);
1914   case MSVCIntrin::_InterlockedAnd_acq:
1915     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1916                                  AtomicOrdering::Acquire);
1917   case MSVCIntrin::_InterlockedAnd_rel:
1918     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1919                                  AtomicOrdering::Release);
1920   case MSVCIntrin::_InterlockedAnd_nf:
1921     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1922                                  AtomicOrdering::Monotonic);
1923   case MSVCIntrin::_InterlockedIncrement_acq:
1924     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Acquire);
1925   case MSVCIntrin::_InterlockedIncrement_rel:
1926     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Release);
1927   case MSVCIntrin::_InterlockedIncrement_nf:
1928     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Monotonic);
1929   case MSVCIntrin::_InterlockedDecrement_acq:
1930     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Acquire);
1931   case MSVCIntrin::_InterlockedDecrement_rel:
1932     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Release);
1933   case MSVCIntrin::_InterlockedDecrement_nf:
1934     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Monotonic);
1935 
1936   case MSVCIntrin::_InterlockedDecrement:
1937     return EmitAtomicDecrementValue(*this, E);
1938   case MSVCIntrin::_InterlockedIncrement:
1939     return EmitAtomicIncrementValue(*this, E);
1940 
1941   case MSVCIntrin::__fastfail: {
1942     // Request immediate process termination from the kernel. The instruction
1943     // sequences to do this are documented on MSDN:
1944     // https://msdn.microsoft.com/en-us/library/dn774154.aspx
1945     llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
1946     StringRef Asm, Constraints;
1947     switch (ISA) {
1948     default:
1949       ErrorUnsupported(E, "__fastfail call for this architecture");
1950       break;
1951     case llvm::Triple::x86:
1952     case llvm::Triple::x86_64:
1953       Asm = "int $$0x29";
1954       Constraints = "{cx}";
1955       break;
1956     case llvm::Triple::thumb:
1957       Asm = "udf #251";
1958       Constraints = "{r0}";
1959       break;
1960     case llvm::Triple::aarch64:
1961       Asm = "brk #0xF003";
1962       Constraints = "{w0}";
1963     }
1964     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
1965     llvm::InlineAsm *IA =
1966         llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1967     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
1968         getLLVMContext(), llvm::AttributeList::FunctionIndex,
1969         llvm::Attribute::NoReturn);
1970     llvm::CallInst *CI = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
1971     CI->setAttributes(NoReturnAttr);
1972     return CI;
1973   }
1974   }
1975   llvm_unreachable("Incorrect MSVC intrinsic!");
1976 }
1977 
1978 namespace {
1979 // ARC cleanup for __builtin_os_log_format
1980 struct CallObjCArcUse final : EHScopeStack::Cleanup {
1981   CallObjCArcUse(llvm::Value *object) : object(object) {}
1982   llvm::Value *object;
1983 
1984   void Emit(CodeGenFunction &CGF, Flags flags) override {
1985     CGF.EmitARCIntrinsicUse(object);
1986   }
1987 };
1988 }
1989 
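/// Emit the argument of a __builtin_clz/__builtin_ctz-style call and, under
/// -fsanitize=builtin, check that it is nonzero (passing zero to these
/// builtins is undefined behavior).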
1990 Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
1991                                                  BuiltinCheckKind Kind) {
1992   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero)
1993           && "Unsupported builtin check kind");
1994 
1995   Value *ArgValue = EmitScalarExpr(E);
1996   if (!SanOpts.has(SanitizerKind::Builtin))
1997     return ArgValue;
1998 
1999   SanitizerScope SanScope(this);
2000   Value *Cond = Builder.CreateICmpNE(
2001       ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
2002   EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
2003             SanitizerHandler::InvalidBuiltin,
2004             {EmitCheckSourceLocation(E->getExprLoc()),
2005              llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
2006             std::nullopt);
2007   return ArgValue;
2008 }
2009 
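/// Emit a call to the llvm.abs intrinsic; HasNSW selects whether INT_MIN is
/// treated as poison (the intrinsic's int_min_is_poison flag).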
2010 static Value *EmitAbs(CodeGenFunction &CGF, Value *ArgValue, bool HasNSW) {
2011   return CGF.Builder.CreateBinaryIntrinsic(
2012       Intrinsic::abs, ArgValue,
2013       ConstantInt::get(CGF.Builder.getInt1Ty(), HasNSW));
2014 }
2015 
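/// Emit an absolute value that checks for INT_MIN overflow, either as a
/// -fsanitize=signed-integer-overflow check or as a trap, depending on
/// SanitizeOverflow.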
2016 static Value *EmitOverflowCheckedAbs(CodeGenFunction &CGF, const CallExpr *E,
2017                                      bool SanitizeOverflow) {
2018   Value *ArgValue = CGF.EmitScalarExpr(E->getArg(0));
2019 
2020   // Try to eliminate overflow check.
2021   if (const auto *VCI = dyn_cast<llvm::ConstantInt>(ArgValue)) {
2022     if (!VCI->isMinSignedValue())
2023       return EmitAbs(CGF, ArgValue, true);
2024   }
2025 
2026   CodeGenFunction::SanitizerScope SanScope(&CGF);
2027 
2028   Constant *Zero = Constant::getNullValue(ArgValue->getType());
2029   Value *ResultAndOverflow = CGF.Builder.CreateBinaryIntrinsic(
2030       Intrinsic::ssub_with_overflow, Zero, ArgValue);
2031   Value *Result = CGF.Builder.CreateExtractValue(ResultAndOverflow, 0);
2032   Value *NotOverflow = CGF.Builder.CreateNot(
2033       CGF.Builder.CreateExtractValue(ResultAndOverflow, 1));
2034 
2035   // TODO: support -ftrapv-handler.
2036   if (SanitizeOverflow) {
2037     CGF.EmitCheck({{NotOverflow, SanitizerKind::SignedIntegerOverflow}},
2038                   SanitizerHandler::NegateOverflow,
2039                   {CGF.EmitCheckSourceLocation(E->getArg(0)->getExprLoc()),
2040                    CGF.EmitCheckTypeDescriptor(E->getType())},
2041                   {ArgValue});
2042   } else
2043     CGF.EmitTrapCheck(NotOverflow, SanitizerHandler::SubOverflow);
2044 
2045   Value *CmpResult = CGF.Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
2046   return CGF.Builder.CreateSelect(CmpResult, Result, ArgValue, "abs");
2047 }
2048 
2049 /// Get the argument type for arguments to os_log_helper.
2050 static CanQualType getOSLogArgType(ASTContext &C, int Size) {
2051   QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
2052   return C.getCanonicalType(UnsignedTy);
2053 }
2054 
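// Generate (or reuse) the helper that serializes an os_log buffer: it writes
// the summary and argument-count bytes, then a descriptor/size/data triple for
// each item. The mangled name encodes the buffer alignment and layout bytes so
// that calls with identical layouts share one linkonce_odr definition.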
2055 llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
2056     const analyze_os_log::OSLogBufferLayout &Layout,
2057     CharUnits BufferAlignment) {
2058   ASTContext &Ctx = getContext();
2059 
2060   llvm::SmallString<64> Name;
2061   {
2062     raw_svector_ostream OS(Name);
2063     OS << "__os_log_helper";
2064     OS << "_" << BufferAlignment.getQuantity();
2065     OS << "_" << int(Layout.getSummaryByte());
2066     OS << "_" << int(Layout.getNumArgsByte());
2067     for (const auto &Item : Layout.Items)
2068       OS << "_" << int(Item.getSizeByte()) << "_"
2069          << int(Item.getDescriptorByte());
2070   }
2071 
2072   if (llvm::Function *F = CGM.getModule().getFunction(Name))
2073     return F;
2074 
2075   llvm::SmallVector<QualType, 4> ArgTys;
2076   FunctionArgList Args;
2077   Args.push_back(ImplicitParamDecl::Create(
2078       Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"), Ctx.VoidPtrTy,
2079       ImplicitParamKind::Other));
2080   ArgTys.emplace_back(Ctx.VoidPtrTy);
2081 
2082   for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
2083     char Size = Layout.Items[I].getSizeByte();
2084     if (!Size)
2085       continue;
2086 
2087     QualType ArgTy = getOSLogArgType(Ctx, Size);
2088     Args.push_back(ImplicitParamDecl::Create(
2089         Ctx, nullptr, SourceLocation(),
2090         &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)), ArgTy,
2091         ImplicitParamKind::Other));
2092     ArgTys.emplace_back(ArgTy);
2093   }
2094 
2095   QualType ReturnTy = Ctx.VoidTy;
2096 
2097   // The helper function has linkonce_odr linkage to enable the linker to merge
2098   // identical functions. To ensure the merging always happens, 'noinline' is
2099   // attached to the function when compiling with -Oz.
2100   const CGFunctionInfo &FI =
2101       CGM.getTypes().arrangeBuiltinFunctionDeclaration(ReturnTy, Args);
2102   llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
2103   llvm::Function *Fn = llvm::Function::Create(
2104       FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
2105   Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
2106   CGM.SetLLVMFunctionAttributes(GlobalDecl(), FI, Fn, /*IsThunk=*/false);
2107   CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);
2108   Fn->setDoesNotThrow();
2109 
2110   // Attach 'noinline' at -Oz.
2111   if (CGM.getCodeGenOpts().OptimizeSize == 2)
2112     Fn->addFnAttr(llvm::Attribute::NoInline);
2113 
2114   auto NL = ApplyDebugLocation::CreateEmpty(*this);
2115   StartFunction(GlobalDecl(), ReturnTy, Fn, FI, Args);
2116 
2117   // Create a scope with an artificial location for the body of this function.
2118   auto AL = ApplyDebugLocation::CreateArtificial(*this);
2119 
2120   CharUnits Offset;
2121   Address BufAddr =
2122       Address(Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"), Int8Ty,
2123               BufferAlignment);
2124   Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
2125                       Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
2126   Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
2127                       Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));
2128 
2129   unsigned I = 1;
2130   for (const auto &Item : Layout.Items) {
2131     Builder.CreateStore(
2132         Builder.getInt8(Item.getDescriptorByte()),
2133         Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
2134     Builder.CreateStore(
2135         Builder.getInt8(Item.getSizeByte()),
2136         Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));
2137 
2138     CharUnits Size = Item.size();
2139     if (!Size.getQuantity())
2140       continue;
2141 
2142     Address Arg = GetAddrOfLocalVar(Args[I]);
2143     Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
2144     Addr = Addr.withElementType(Arg.getElementType());
2145     Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
2146     Offset += Size;
2147     ++I;
2148   }
2149 
2150   FinishFunction();
2151 
2152   return Fn;
2153 }
2154 
2155 RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
2156   assert(E.getNumArgs() >= 2 &&
2157          "__builtin_os_log_format takes at least 2 arguments");
2158   ASTContext &Ctx = getContext();
2159   analyze_os_log::OSLogBufferLayout Layout;
2160   analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
2161   Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
2162   llvm::SmallVector<llvm::Value *, 4> RetainableOperands;
2163 
2164   // Ignore argument 1, the format string. It is not currently used.
2165   CallArgList Args;
2166   Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);
2167 
2168   for (const auto &Item : Layout.Items) {
2169     int Size = Item.getSizeByte();
2170     if (!Size)
2171       continue;
2172 
2173     llvm::Value *ArgVal;
2174 
2175     if (Item.getKind() == analyze_os_log::OSLogBufferItem::MaskKind) {
2176       uint64_t Val = 0;
2177       for (unsigned I = 0, E = Item.getMaskType().size(); I < E; ++I)
2178         Val |= ((uint64_t)Item.getMaskType()[I]) << I * 8;
2179       ArgVal = llvm::Constant::getIntegerValue(Int64Ty, llvm::APInt(64, Val));
2180     } else if (const Expr *TheExpr = Item.getExpr()) {
2181       ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);
2182 
2183       // If a temporary object that requires destruction after the full
2184       // expression is passed, push a lifetime-extended cleanup to extend its
2185       // lifetime to the end of the enclosing block scope.
2186       auto LifetimeExtendObject = [&](const Expr *E) {
2187         E = E->IgnoreParenCasts();
2188         // Extend lifetimes of objects returned by function calls and message
2189         // sends.
2190 
2191         // FIXME: We should do this in other cases in which temporaries are
2192         //        created including arguments of non-ARC types (e.g., C++
2193         //        temporaries).
2194         if (isa<CallExpr>(E) || isa<ObjCMessageExpr>(E))
2195           return true;
2196         return false;
2197       };
2198 
2199       if (TheExpr->getType()->isObjCRetainableType() &&
2200           getLangOpts().ObjCAutoRefCount && LifetimeExtendObject(TheExpr)) {
2201         assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
2202                "Only a scalar can be an ObjC retainable type");
2203         if (!isa<Constant>(ArgVal)) {
2204           CleanupKind Cleanup = getARCCleanupKind();
2205           QualType Ty = TheExpr->getType();
2206           Address Alloca = Address::invalid();
2207           Address Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca);
2208           ArgVal = EmitARCRetain(Ty, ArgVal);
2209           Builder.CreateStore(ArgVal, Addr);
2210           pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty,
2211                                       CodeGenFunction::destroyARCStrongPrecise,
2212                                       Cleanup & EHCleanup);
2213 
2214           // Push a clang.arc.use call to ensure the ARC optimizer knows that
2215           // the argument has to be kept alive.
2216           if (CGM.getCodeGenOpts().OptimizationLevel != 0)
2217             pushCleanupAfterFullExpr<CallObjCArcUse>(Cleanup, ArgVal);
2218         }
2219       }
2220     } else {
2221       ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
2222     }
2223 
2224     unsigned ArgValSize =
2225         CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
2226     llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
2227                                                      ArgValSize);
2228     ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
2229     CanQualType ArgTy = getOSLogArgType(Ctx, Size);
2230     // If ArgVal has type x86_fp80, zero-extend ArgVal.
2231     ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
2232     Args.add(RValue::get(ArgVal), ArgTy);
2233   }
2234 
2235   const CGFunctionInfo &FI =
2236       CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
2237   llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
2238       Layout, BufAddr.getAlignment());
2239   EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);
2240   return RValue::get(BufAddr.getPointer());
2241 }
2242 
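/// Check whether this is a __builtin_mul_overflow with two unsigned operands
/// and a signed result, all of the same width. Such calls can be lowered as an
/// unsigned multiply followed by a comparison against the signed maximum.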
2243 static bool isSpecialUnsignedMultiplySignedResult(
2244     unsigned BuiltinID, WidthAndSignedness Op1Info, WidthAndSignedness Op2Info,
2245     WidthAndSignedness ResultInfo) {
2246   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
2247          Op1Info.Width == Op2Info.Width && Op2Info.Width == ResultInfo.Width &&
2248          !Op1Info.Signed && !Op2Info.Signed && ResultInfo.Signed;
2249 }
2250 
2251 static RValue EmitCheckedUnsignedMultiplySignedResult(
2252     CodeGenFunction &CGF, const clang::Expr *Op1, WidthAndSignedness Op1Info,
2253     const clang::Expr *Op2, WidthAndSignedness Op2Info,
2254     const clang::Expr *ResultArg, QualType ResultQTy,
2255     WidthAndSignedness ResultInfo) {
2256   assert(isSpecialUnsignedMultiplySignedResult(
2257              Builtin::BI__builtin_mul_overflow, Op1Info, Op2Info, ResultInfo) &&
2258          "Cannot specialize this multiply");
2259 
2260   llvm::Value *V1 = CGF.EmitScalarExpr(Op1);
2261   llvm::Value *V2 = CGF.EmitScalarExpr(Op2);
2262 
2263   llvm::Value *HasOverflow;
2264   llvm::Value *Result = EmitOverflowIntrinsic(
2265       CGF, llvm::Intrinsic::umul_with_overflow, V1, V2, HasOverflow);
2266 
2267   // The intrinsic call will detect overflow when the value is > UINT_MAX;
2268   // however, since the original builtin had a signed result, we need to report
2269   // an overflow when the result is greater than INT_MAX.
2270   auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width);
2271   llvm::Value *IntMaxValue = llvm::ConstantInt::get(Result->getType(), IntMax);
2272 
2273   llvm::Value *IntMaxOverflow = CGF.Builder.CreateICmpUGT(Result, IntMaxValue);
2274   HasOverflow = CGF.Builder.CreateOr(HasOverflow, IntMaxOverflow);
2275 
2276   bool isVolatile =
2277       ResultArg->getType()->getPointeeType().isVolatileQualified();
2278   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2279   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2280                           isVolatile);
2281   return RValue::get(HasOverflow);
2282 }
2283 
2284 /// Determine if a binop is a checked mixed-sign multiply we can specialize.
2285 static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
2286                                        WidthAndSignedness Op1Info,
2287                                        WidthAndSignedness Op2Info,
2288                                        WidthAndSignedness ResultInfo) {
2289   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
2290          std::max(Op1Info.Width, Op2Info.Width) >= ResultInfo.Width &&
2291          Op1Info.Signed != Op2Info.Signed;
2292 }
2293 
2294 /// Emit a checked mixed-sign multiply. This is a cheaper specialization of
2295 /// the generic checked-binop irgen.
2296 static RValue
2297 EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
2298                              WidthAndSignedness Op1Info, const clang::Expr *Op2,
2299                              WidthAndSignedness Op2Info,
2300                              const clang::Expr *ResultArg, QualType ResultQTy,
2301                              WidthAndSignedness ResultInfo) {
2302   assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
2303                                     Op2Info, ResultInfo) &&
2304          "Not a mixed-sign multiplication we can specialize");
2305 
2306   // Emit the signed and unsigned operands.
2307   const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
2308   const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
2309   llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
2310   llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);
2311   unsigned SignedOpWidth = Op1Info.Signed ? Op1Info.Width : Op2Info.Width;
2312   unsigned UnsignedOpWidth = Op1Info.Signed ? Op2Info.Width : Op1Info.Width;
2313 
2314   // One of the operands may be smaller than the other. If so, [s|z]ext it.
2315   if (SignedOpWidth < UnsignedOpWidth)
2316     Signed = CGF.Builder.CreateSExt(Signed, Unsigned->getType(), "op.sext");
2317   if (UnsignedOpWidth < SignedOpWidth)
2318     Unsigned = CGF.Builder.CreateZExt(Unsigned, Signed->getType(), "op.zext");
2319 
2320   llvm::Type *OpTy = Signed->getType();
2321   llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
2322   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2323   llvm::Type *ResTy = ResultPtr.getElementType();
2324   unsigned OpWidth = std::max(Op1Info.Width, Op2Info.Width);
2325 
2326   // Take the absolute value of the signed operand.
2327   llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
2328   llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
2329   llvm::Value *AbsSigned =
2330       CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);
2331 
2332   // Perform a checked unsigned multiplication.
2333   llvm::Value *UnsignedOverflow;
2334   llvm::Value *UnsignedResult =
2335       EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
2336                             Unsigned, UnsignedOverflow);
2337 
2338   llvm::Value *Overflow, *Result;
2339   if (ResultInfo.Signed) {
2340     // Signed overflow occurs if the result is greater than INT_MAX or less
2341     // than INT_MIN, i.e. when |Result| > (INT_MAX + IsNegative).
2342     auto IntMax =
2343         llvm::APInt::getSignedMaxValue(ResultInfo.Width).zext(OpWidth);
2344     llvm::Value *MaxResult =
2345         CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
2346                               CGF.Builder.CreateZExt(IsNegative, OpTy));
2347     llvm::Value *SignedOverflow =
2348         CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
2349     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);
2350 
2351     // Prepare the signed result (possibly by negating it).
2352     llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
2353     llvm::Value *SignedResult =
2354         CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
2355     Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
2356   } else {
2357     // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
2358     llvm::Value *Underflow = CGF.Builder.CreateAnd(
2359         IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
2360     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
2361     if (ResultInfo.Width < OpWidth) {
2362       auto IntMax =
2363           llvm::APInt::getMaxValue(ResultInfo.Width).zext(OpWidth);
2364       llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
2365           UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
2366       Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
2367     }
2368 
2369     // Negate the product if it would be negative in infinite precision.
2370     Result = CGF.Builder.CreateSelect(
2371         IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);
2372 
2373     Result = CGF.Builder.CreateTrunc(Result, ResTy);
2374   }
2375   assert(Overflow && Result && "Missing overflow or result");
2376 
2377   bool isVolatile =
2378       ResultArg->getType()->getPointeeType().isVolatileQualified();
2379   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2380                           isVolatile);
2381   return RValue::get(Overflow);
2382 }
2383 
2384 static bool
2385 TypeRequiresBuiltinLaunderImp(const ASTContext &Ctx, QualType Ty,
2386                               llvm::SmallPtrSetImpl<const Decl *> &Seen) {
2387   if (const auto *Arr = Ctx.getAsArrayType(Ty))
2388     Ty = Ctx.getBaseElementType(Arr);
2389 
2390   const auto *Record = Ty->getAsCXXRecordDecl();
2391   if (!Record)
2392     return false;
2393 
2394   // We've already checked this type, or are in the process of checking it.
2395   if (!Seen.insert(Record).second)
2396     return false;
2397 
2398   assert(Record->hasDefinition() &&
2399          "Incomplete types should already be diagnosed");
2400 
2401   if (Record->isDynamicClass())
2402     return true;
2403 
2404   for (FieldDecl *F : Record->fields()) {
2405     if (TypeRequiresBuiltinLaunderImp(Ctx, F->getType(), Seen))
2406       return true;
2407   }
2408   return false;
2409 }
2410 
2411 /// Determine if the specified type requires laundering by checking if it is a
2412 /// dynamic class type or contains a subobject which is a dynamic class type.
2413 static bool TypeRequiresBuiltinLaunder(CodeGenModule &CGM, QualType Ty) {
2414   if (!CGM.getCodeGenOpts().StrictVTablePointers)
2415     return false;
2416   llvm::SmallPtrSet<const Decl *, 16> Seen;
2417   return TypeRequiresBuiltinLaunderImp(CGM.getContext(), Ty, Seen);
2418 }
2419 
2420 RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) {
2421   llvm::Value *Src = EmitScalarExpr(E->getArg(0));
2422   llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1));
2423 
2424   // The builtin's shift arg may have a different type than the source arg and
2425   // result, but the LLVM intrinsic uses the same type for all values.
2426   llvm::Type *Ty = Src->getType();
2427   ShiftAmt = Builder.CreateIntCast(ShiftAmt, Ty, false);
2428 
2429   // Rotate is a special case of LLVM funnel shift - 1st 2 args are the same.
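  // e.g. rotl(x, n) == fshl(x, x, n) and rotr(x, n) == fshr(x, x, n).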
2430   unsigned IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl;
2431   Function *F = CGM.getIntrinsic(IID, Ty);
2432   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
2433 }
2434 
2435 // Map long-double math builtins to their f128 versions.
2436 static unsigned mutateLongDoubleBuiltin(unsigned BuiltinID) {
2437   switch (BuiltinID) {
2438 #define MUTATE_LDBL(func) \
2439   case Builtin::BI__builtin_##func##l: \
2440     return Builtin::BI__builtin_##func##f128;
2441   MUTATE_LDBL(sqrt)
2442   MUTATE_LDBL(cbrt)
2443   MUTATE_LDBL(fabs)
2444   MUTATE_LDBL(log)
2445   MUTATE_LDBL(log2)
2446   MUTATE_LDBL(log10)
2447   MUTATE_LDBL(log1p)
2448   MUTATE_LDBL(logb)
2449   MUTATE_LDBL(exp)
2450   MUTATE_LDBL(exp2)
2451   MUTATE_LDBL(expm1)
2452   MUTATE_LDBL(fdim)
2453   MUTATE_LDBL(hypot)
2454   MUTATE_LDBL(ilogb)
2455   MUTATE_LDBL(pow)
2456   MUTATE_LDBL(fmin)
2457   MUTATE_LDBL(fmax)
2458   MUTATE_LDBL(ceil)
2459   MUTATE_LDBL(trunc)
2460   MUTATE_LDBL(rint)
2461   MUTATE_LDBL(nearbyint)
2462   MUTATE_LDBL(round)
2463   MUTATE_LDBL(floor)
2464   MUTATE_LDBL(lround)
2465   MUTATE_LDBL(llround)
2466   MUTATE_LDBL(lrint)
2467   MUTATE_LDBL(llrint)
2468   MUTATE_LDBL(fmod)
2469   MUTATE_LDBL(modf)
2470   MUTATE_LDBL(nan)
2471   MUTATE_LDBL(nans)
2472   MUTATE_LDBL(inf)
2473   MUTATE_LDBL(fma)
2474   MUTATE_LDBL(sin)
2475   MUTATE_LDBL(cos)
2476   MUTATE_LDBL(tan)
2477   MUTATE_LDBL(sinh)
2478   MUTATE_LDBL(cosh)
2479   MUTATE_LDBL(tanh)
2480   MUTATE_LDBL(asin)
2481   MUTATE_LDBL(acos)
2482   MUTATE_LDBL(atan)
2483   MUTATE_LDBL(asinh)
2484   MUTATE_LDBL(acosh)
2485   MUTATE_LDBL(atanh)
2486   MUTATE_LDBL(atan2)
2487   MUTATE_LDBL(erf)
2488   MUTATE_LDBL(erfc)
2489   MUTATE_LDBL(ldexp)
2490   MUTATE_LDBL(frexp)
2491   MUTATE_LDBL(huge_val)
2492   MUTATE_LDBL(copysign)
2493   MUTATE_LDBL(nextafter)
2494   MUTATE_LDBL(nexttoward)
2495   MUTATE_LDBL(remainder)
2496   MUTATE_LDBL(remquo)
2497   MUTATE_LDBL(scalbln)
2498   MUTATE_LDBL(scalbn)
2499   MUTATE_LDBL(tgamma)
2500   MUTATE_LDBL(lgamma)
2501 #undef MUTATE_LDBL
2502   default:
2503     return BuiltinID;
2504   }
2505 }
2506 
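// Under strict floating point, when FP exceptions are not ignored, give the
// target a chance to emit its own lowering for an FP-classification builtin;
// returns nullptr if the target has no special handling.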
2507 static Value *tryUseTestFPKind(CodeGenFunction &CGF, unsigned BuiltinID,
2508                                Value *V) {
2509   if (CGF.Builder.getIsFPConstrained() &&
2510       CGF.Builder.getDefaultConstrainedExcept() != fp::ebIgnore) {
2511     if (Value *Result =
2512             CGF.getTargetHooks().testFPKind(V, BuiltinID, CGF.Builder, CGF.CGM))
2513       return Result;
2514   }
2515   return nullptr;
2516 }
2517 
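// For HIP stdpar, builtins that cannot be handled are rewritten into calls to
// a synthesized '<name>__hipstdpar_unsupported' declaration with the original
// function type; every formal argument is passed as poison.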
2518 static RValue EmitHipStdParUnsupportedBuiltin(CodeGenFunction *CGF,
2519                                               const FunctionDecl *FD) {
2520   auto Name = FD->getNameAsString() + "__hipstdpar_unsupported";
2521   auto FnTy = CGF->CGM.getTypes().GetFunctionType(FD);
2522   auto UBF = CGF->CGM.getModule().getOrInsertFunction(Name, FnTy);
2523 
2524   SmallVector<Value *, 16> Args;
2525   for (auto &&FormalTy : FnTy->params())
2526     Args.push_back(llvm::PoisonValue::get(FormalTy));
2527 
2528   return RValue::get(CGF->Builder.CreateCall(UBF, Args));
2529 }
2530 
2531 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
2532                                         const CallExpr *E,
2533                                         ReturnValueSlot ReturnValue) {
2534   const FunctionDecl *FD = GD.getDecl()->getAsFunction();
2535   // See if we can constant fold this builtin.  If so, don't emit it at all.
2536   // TODO: Extend this handling to all builtin calls that we can constant-fold.
2537   Expr::EvalResult Result;
2538   if (E->isPRValue() && E->EvaluateAsRValue(Result, CGM.getContext()) &&
2539       !Result.hasSideEffects()) {
2540     if (Result.Val.isInt())
2541       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
2542                                                 Result.Val.getInt()));
2543     if (Result.Val.isFloat())
2544       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
2545                                                Result.Val.getFloat()));
2546   }
2547 
2548   // If the current long-double semantics are IEEE 128-bit, replace math
2549   // builtins of long-double with their f128 equivalents.
2550   // TODO: This mutation should also be applied to targets other than PPC,
2551   // once the backend supports IEEE 128-bit style libcalls.
2552   if (getTarget().getTriple().isPPC64() &&
2553       &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
2554     BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
2555 
2556   // If the builtin has been declared explicitly with an assembler label,
2557   // disable the specialized emission below. Ideally we should communicate the
2558   // rename in IR, or at least avoid generating the intrinsic calls that are
2559   // likely to get lowered to the renamed library functions.
2560   const unsigned BuiltinIDIfNoAsmLabel =
2561       FD->hasAttr<AsmLabelAttr>() ? 0 : BuiltinID;
2562 
2563   std::optional<bool> ErrnoOverriden;
2564   // ErrnoOverriden is true if math-errno is overridden via
2565   // '#pragma float_control(precise, on)'. This pragma disables fast-math,
2566   // which in turn implies math-errno.
2567   if (E->hasStoredFPFeatures()) {
2568     FPOptionsOverride OP = E->getFPFeatures();
2569     if (OP.hasMathErrnoOverride())
2570       ErrnoOverriden = OP.getMathErrnoOverride();
2571   }
2572   // True if '__attribute__((optnone))' is used. This attribute overrides
2573   // fast-math, which implies math-errno.
2574   bool OptNone = CurFuncDecl && CurFuncDecl->hasAttr<OptimizeNoneAttr>();
2575 
2576   // True if we are compiling with optimization and errno has been disabled
2577   // via '#pragma float_control(precise, off)', and the
2578   // optnone attribute hasn't been seen.
2579   bool ErrnoOverridenToFalseWithOpt =
2580        ErrnoOverriden.has_value() && !ErrnoOverriden.value() && !OptNone &&
2581        CGM.getCodeGenOpts().OptimizationLevel != 0;
2582 
2583   // There are LLVM math intrinsics/instructions corresponding to math library
2584   // functions, except that the LLVM op never sets errno while the math library
2585   // function might. Also, math builtins have the same semantics as their math
2586   // library twins. Thus, we can transform math library and builtin calls into
2587   // their LLVM counterparts if the call is marked 'const' (known to never set
2588   // errno). If FP exceptions are enabled, the experimental constrained versions
2589   // of the intrinsics model them.
2590   bool ConstAlways =
2591       getContext().BuiltinInfo.isConst(BuiltinID);
2592 
2593   // There's a special case with the fma builtins where they are always const
2594   // if the target environment is GNU, or the target OS is Windows and we're
2595   // targeting the MSVCRT.dll environment.
2596   // FIXME: This list can become outdated. Need to find a way to derive it
2597   // some other way.
2598   switch (BuiltinID) {
2599   case Builtin::BI__builtin_fma:
2600   case Builtin::BI__builtin_fmaf:
2601   case Builtin::BI__builtin_fmal:
2602   case Builtin::BIfma:
2603   case Builtin::BIfmaf:
2604   case Builtin::BIfmal: {
2605     auto &Trip = CGM.getTriple();
2606     if (Trip.isGNUEnvironment() || Trip.isOSMSVCRT())
2607       ConstAlways = true;
2608     break;
2609   }
2610   default:
2611     break;
2612   }
2613 
2614   bool ConstWithoutErrnoAndExceptions =
2615       getContext().BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
2616   bool ConstWithoutExceptions =
2617       getContext().BuiltinInfo.isConstWithoutExceptions(BuiltinID);
2618 
2619   // ConstAttr is enabled in fast-math mode. In fast-math mode, math-errno is
2620   // disabled.
2621   // Math intrinsics are generated only when math-errno is disabled. Any pragmas
2622   // or attributes that affect math-errno should prevent or allow math
2623   // intrinsics to be generated. Intrinsics are generated:
2624   //   1- In fast-math mode, unless math-errno is overridden
2625   //      via '#pragma float_control(precise, on)' or via
2626   //      '__attribute__((optnone))'.
2627   //   2- If math-errno was enabled on the command line but overridden
2628   //      to false via '#pragma float_control(precise, off)' and
2629   //      '__attribute__((optnone))' hasn't been used.
2630   //   3- If we are compiling with optimization and errno has been disabled
2631   //      via '#pragma float_control(precise, off)', and
2632   //      '__attribute__((optnone))' hasn't been used.
2633 
2634   bool ConstWithoutErrnoOrExceptions =
2635       ConstWithoutErrnoAndExceptions || ConstWithoutExceptions;
2636   bool GenerateIntrinsics =
2637       (ConstAlways && !OptNone) ||
2638       (!getLangOpts().MathErrno &&
2639        !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
2640   if (!GenerateIntrinsics) {
2641     GenerateIntrinsics =
2642         ConstWithoutErrnoOrExceptions && !ConstWithoutErrnoAndExceptions;
2643     if (!GenerateIntrinsics)
2644       GenerateIntrinsics =
2645           ConstWithoutErrnoOrExceptions &&
2646           (!getLangOpts().MathErrno &&
2647            !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
2648     if (!GenerateIntrinsics)
2649       GenerateIntrinsics =
2650           ConstWithoutErrnoOrExceptions && ErrnoOverridenToFalseWithOpt;
2651   }
2652   if (GenerateIntrinsics) {
2653     switch (BuiltinIDIfNoAsmLabel) {
2654     case Builtin::BIceil:
2655     case Builtin::BIceilf:
2656     case Builtin::BIceill:
2657     case Builtin::BI__builtin_ceil:
2658     case Builtin::BI__builtin_ceilf:
2659     case Builtin::BI__builtin_ceilf16:
2660     case Builtin::BI__builtin_ceill:
2661     case Builtin::BI__builtin_ceilf128:
2662       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2663                                    Intrinsic::ceil,
2664                                    Intrinsic::experimental_constrained_ceil));
2665 
2666     case Builtin::BIcopysign:
2667     case Builtin::BIcopysignf:
2668     case Builtin::BIcopysignl:
2669     case Builtin::BI__builtin_copysign:
2670     case Builtin::BI__builtin_copysignf:
2671     case Builtin::BI__builtin_copysignf16:
2672     case Builtin::BI__builtin_copysignl:
2673     case Builtin::BI__builtin_copysignf128:
2674       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
2675 
2676     case Builtin::BIcos:
2677     case Builtin::BIcosf:
2678     case Builtin::BIcosl:
2679     case Builtin::BI__builtin_cos:
2680     case Builtin::BI__builtin_cosf:
2681     case Builtin::BI__builtin_cosf16:
2682     case Builtin::BI__builtin_cosl:
2683     case Builtin::BI__builtin_cosf128:
2684       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2685                                    Intrinsic::cos,
2686                                    Intrinsic::experimental_constrained_cos));
2687 
2688     case Builtin::BIexp:
2689     case Builtin::BIexpf:
2690     case Builtin::BIexpl:
2691     case Builtin::BI__builtin_exp:
2692     case Builtin::BI__builtin_expf:
2693     case Builtin::BI__builtin_expf16:
2694     case Builtin::BI__builtin_expl:
2695     case Builtin::BI__builtin_expf128:
2696       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2697                                    Intrinsic::exp,
2698                                    Intrinsic::experimental_constrained_exp));
2699 
2700     case Builtin::BIexp2:
2701     case Builtin::BIexp2f:
2702     case Builtin::BIexp2l:
2703     case Builtin::BI__builtin_exp2:
2704     case Builtin::BI__builtin_exp2f:
2705     case Builtin::BI__builtin_exp2f16:
2706     case Builtin::BI__builtin_exp2l:
2707     case Builtin::BI__builtin_exp2f128:
2708       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2709                                    Intrinsic::exp2,
2710                                    Intrinsic::experimental_constrained_exp2));
2711     case Builtin::BI__builtin_exp10:
2712     case Builtin::BI__builtin_exp10f:
2713     case Builtin::BI__builtin_exp10f16:
2714     case Builtin::BI__builtin_exp10l:
2715     case Builtin::BI__builtin_exp10f128: {
2716       // TODO: strictfp support
2717       if (Builder.getIsFPConstrained())
2718         break;
2719       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp10));
2720     }
2721     case Builtin::BIfabs:
2722     case Builtin::BIfabsf:
2723     case Builtin::BIfabsl:
2724     case Builtin::BI__builtin_fabs:
2725     case Builtin::BI__builtin_fabsf:
2726     case Builtin::BI__builtin_fabsf16:
2727     case Builtin::BI__builtin_fabsl:
2728     case Builtin::BI__builtin_fabsf128:
2729       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
2730 
2731     case Builtin::BIfloor:
2732     case Builtin::BIfloorf:
2733     case Builtin::BIfloorl:
2734     case Builtin::BI__builtin_floor:
2735     case Builtin::BI__builtin_floorf:
2736     case Builtin::BI__builtin_floorf16:
2737     case Builtin::BI__builtin_floorl:
2738     case Builtin::BI__builtin_floorf128:
2739       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2740                                    Intrinsic::floor,
2741                                    Intrinsic::experimental_constrained_floor));
2742 
2743     case Builtin::BIfma:
2744     case Builtin::BIfmaf:
2745     case Builtin::BIfmal:
2746     case Builtin::BI__builtin_fma:
2747     case Builtin::BI__builtin_fmaf:
2748     case Builtin::BI__builtin_fmaf16:
2749     case Builtin::BI__builtin_fmal:
2750     case Builtin::BI__builtin_fmaf128:
2751       return RValue::get(emitTernaryMaybeConstrainedFPBuiltin(*this, E,
2752                                    Intrinsic::fma,
2753                                    Intrinsic::experimental_constrained_fma));
2754 
2755     case Builtin::BIfmax:
2756     case Builtin::BIfmaxf:
2757     case Builtin::BIfmaxl:
2758     case Builtin::BI__builtin_fmax:
2759     case Builtin::BI__builtin_fmaxf:
2760     case Builtin::BI__builtin_fmaxf16:
2761     case Builtin::BI__builtin_fmaxl:
2762     case Builtin::BI__builtin_fmaxf128:
2763       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2764                                    Intrinsic::maxnum,
2765                                    Intrinsic::experimental_constrained_maxnum));
2766 
2767     case Builtin::BIfmin:
2768     case Builtin::BIfminf:
2769     case Builtin::BIfminl:
2770     case Builtin::BI__builtin_fmin:
2771     case Builtin::BI__builtin_fminf:
2772     case Builtin::BI__builtin_fminf16:
2773     case Builtin::BI__builtin_fminl:
2774     case Builtin::BI__builtin_fminf128:
2775       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2776                                    Intrinsic::minnum,
2777                                    Intrinsic::experimental_constrained_minnum));
2778 
2779     // fmod() is a special case. It maps to the frem instruction rather than an
2780     // LLVM intrinsic.
2781     case Builtin::BIfmod:
2782     case Builtin::BIfmodf:
2783     case Builtin::BIfmodl:
2784     case Builtin::BI__builtin_fmod:
2785     case Builtin::BI__builtin_fmodf:
2786     case Builtin::BI__builtin_fmodf16:
2787     case Builtin::BI__builtin_fmodl:
2788     case Builtin::BI__builtin_fmodf128: {
2789       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2790       Value *Arg1 = EmitScalarExpr(E->getArg(0));
2791       Value *Arg2 = EmitScalarExpr(E->getArg(1));
2792       return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
2793     }
2794 
2795     case Builtin::BIlog:
2796     case Builtin::BIlogf:
2797     case Builtin::BIlogl:
2798     case Builtin::BI__builtin_log:
2799     case Builtin::BI__builtin_logf:
2800     case Builtin::BI__builtin_logf16:
2801     case Builtin::BI__builtin_logl:
2802     case Builtin::BI__builtin_logf128:
2803       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2804                                    Intrinsic::log,
2805                                    Intrinsic::experimental_constrained_log));
2806 
2807     case Builtin::BIlog10:
2808     case Builtin::BIlog10f:
2809     case Builtin::BIlog10l:
2810     case Builtin::BI__builtin_log10:
2811     case Builtin::BI__builtin_log10f:
2812     case Builtin::BI__builtin_log10f16:
2813     case Builtin::BI__builtin_log10l:
2814     case Builtin::BI__builtin_log10f128:
2815       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2816                                    Intrinsic::log10,
2817                                    Intrinsic::experimental_constrained_log10));
2818 
2819     case Builtin::BIlog2:
2820     case Builtin::BIlog2f:
2821     case Builtin::BIlog2l:
2822     case Builtin::BI__builtin_log2:
2823     case Builtin::BI__builtin_log2f:
2824     case Builtin::BI__builtin_log2f16:
2825     case Builtin::BI__builtin_log2l:
2826     case Builtin::BI__builtin_log2f128:
2827       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2828                                    Intrinsic::log2,
2829                                    Intrinsic::experimental_constrained_log2));
2830 
2831     case Builtin::BInearbyint:
2832     case Builtin::BInearbyintf:
2833     case Builtin::BInearbyintl:
2834     case Builtin::BI__builtin_nearbyint:
2835     case Builtin::BI__builtin_nearbyintf:
2836     case Builtin::BI__builtin_nearbyintl:
2837     case Builtin::BI__builtin_nearbyintf128:
2838       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2839                                 Intrinsic::nearbyint,
2840                                 Intrinsic::experimental_constrained_nearbyint));
2841 
2842     case Builtin::BIpow:
2843     case Builtin::BIpowf:
2844     case Builtin::BIpowl:
2845     case Builtin::BI__builtin_pow:
2846     case Builtin::BI__builtin_powf:
2847     case Builtin::BI__builtin_powf16:
2848     case Builtin::BI__builtin_powl:
2849     case Builtin::BI__builtin_powf128:
2850       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2851                                    Intrinsic::pow,
2852                                    Intrinsic::experimental_constrained_pow));
2853 
2854     case Builtin::BIrint:
2855     case Builtin::BIrintf:
2856     case Builtin::BIrintl:
2857     case Builtin::BI__builtin_rint:
2858     case Builtin::BI__builtin_rintf:
2859     case Builtin::BI__builtin_rintf16:
2860     case Builtin::BI__builtin_rintl:
2861     case Builtin::BI__builtin_rintf128:
2862       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2863                                    Intrinsic::rint,
2864                                    Intrinsic::experimental_constrained_rint));
2865 
2866     case Builtin::BIround:
2867     case Builtin::BIroundf:
2868     case Builtin::BIroundl:
2869     case Builtin::BI__builtin_round:
2870     case Builtin::BI__builtin_roundf:
2871     case Builtin::BI__builtin_roundf16:
2872     case Builtin::BI__builtin_roundl:
2873     case Builtin::BI__builtin_roundf128:
2874       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2875                                    Intrinsic::round,
2876                                    Intrinsic::experimental_constrained_round));
2877 
2878     case Builtin::BIroundeven:
2879     case Builtin::BIroundevenf:
2880     case Builtin::BIroundevenl:
2881     case Builtin::BI__builtin_roundeven:
2882     case Builtin::BI__builtin_roundevenf:
2883     case Builtin::BI__builtin_roundevenf16:
2884     case Builtin::BI__builtin_roundevenl:
2885     case Builtin::BI__builtin_roundevenf128:
2886       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2887                                    Intrinsic::roundeven,
2888                                    Intrinsic::experimental_constrained_roundeven));
2889 
2890     case Builtin::BIsin:
2891     case Builtin::BIsinf:
2892     case Builtin::BIsinl:
2893     case Builtin::BI__builtin_sin:
2894     case Builtin::BI__builtin_sinf:
2895     case Builtin::BI__builtin_sinf16:
2896     case Builtin::BI__builtin_sinl:
2897     case Builtin::BI__builtin_sinf128:
2898       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2899                                    Intrinsic::sin,
2900                                    Intrinsic::experimental_constrained_sin));
2901 
2902     case Builtin::BIsqrt:
2903     case Builtin::BIsqrtf:
2904     case Builtin::BIsqrtl:
2905     case Builtin::BI__builtin_sqrt:
2906     case Builtin::BI__builtin_sqrtf:
2907     case Builtin::BI__builtin_sqrtf16:
2908     case Builtin::BI__builtin_sqrtl:
2909     case Builtin::BI__builtin_sqrtf128:
2910     case Builtin::BI__builtin_elementwise_sqrt: {
2911       llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
2912           *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
2913       SetSqrtFPAccuracy(Call);
2914       return RValue::get(Call);
2915     }
2916     case Builtin::BItrunc:
2917     case Builtin::BItruncf:
2918     case Builtin::BItruncl:
2919     case Builtin::BI__builtin_trunc:
2920     case Builtin::BI__builtin_truncf:
2921     case Builtin::BI__builtin_truncf16:
2922     case Builtin::BI__builtin_truncl:
2923     case Builtin::BI__builtin_truncf128:
2924       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2925                                    Intrinsic::trunc,
2926                                    Intrinsic::experimental_constrained_trunc));
2927 
2928     case Builtin::BIlround:
2929     case Builtin::BIlroundf:
2930     case Builtin::BIlroundl:
2931     case Builtin::BI__builtin_lround:
2932     case Builtin::BI__builtin_lroundf:
2933     case Builtin::BI__builtin_lroundl:
2934     case Builtin::BI__builtin_lroundf128:
2935       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2936           *this, E, Intrinsic::lround,
2937           Intrinsic::experimental_constrained_lround));
2938 
2939     case Builtin::BIllround:
2940     case Builtin::BIllroundf:
2941     case Builtin::BIllroundl:
2942     case Builtin::BI__builtin_llround:
2943     case Builtin::BI__builtin_llroundf:
2944     case Builtin::BI__builtin_llroundl:
2945     case Builtin::BI__builtin_llroundf128:
2946       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2947           *this, E, Intrinsic::llround,
2948           Intrinsic::experimental_constrained_llround));
2949 
2950     case Builtin::BIlrint:
2951     case Builtin::BIlrintf:
2952     case Builtin::BIlrintl:
2953     case Builtin::BI__builtin_lrint:
2954     case Builtin::BI__builtin_lrintf:
2955     case Builtin::BI__builtin_lrintl:
2956     case Builtin::BI__builtin_lrintf128:
2957       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2958           *this, E, Intrinsic::lrint,
2959           Intrinsic::experimental_constrained_lrint));
2960 
2961     case Builtin::BIllrint:
2962     case Builtin::BIllrintf:
2963     case Builtin::BIllrintl:
2964     case Builtin::BI__builtin_llrint:
2965     case Builtin::BI__builtin_llrintf:
2966     case Builtin::BI__builtin_llrintl:
2967     case Builtin::BI__builtin_llrintf128:
2968       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2969           *this, E, Intrinsic::llrint,
2970           Intrinsic::experimental_constrained_llrint));
2971     case Builtin::BI__builtin_ldexp:
2972     case Builtin::BI__builtin_ldexpf:
2973     case Builtin::BI__builtin_ldexpl:
2974     case Builtin::BI__builtin_ldexpf16:
2975     case Builtin::BI__builtin_ldexpf128: {
2976       return RValue::get(emitBinaryExpMaybeConstrainedFPBuiltin(
2977           *this, E, Intrinsic::ldexp,
2978           Intrinsic::experimental_constrained_ldexp));
2979     }
2980     default:
2981       break;
2982     }
2983   }
2984 
2985   // Check NonnullAttribute/NullabilityArg and Alignment.
2986   auto EmitArgCheck = [&](TypeCheckKind Kind, Address A, const Expr *Arg,
2987                           unsigned ParmNum) {
2988     Value *Val = A.getPointer();
2989     EmitNonNullArgCheck(RValue::get(Val), Arg->getType(), Arg->getExprLoc(), FD,
2990                         ParmNum);
2991 
2992     if (SanOpts.has(SanitizerKind::Alignment)) {
2993       SanitizerSet SkippedChecks;
2994       SkippedChecks.set(SanitizerKind::All);
2995       SkippedChecks.clear(SanitizerKind::Alignment);
2996       SourceLocation Loc = Arg->getExprLoc();
2997       // Strip an implicit cast.
2998       if (auto *CE = dyn_cast<ImplicitCastExpr>(Arg))
2999         if (CE->getCastKind() == CK_BitCast)
3000           Arg = CE->getSubExpr();
3001       EmitTypeCheck(Kind, Loc, Val, Arg->getType(), A.getAlignment(),
3002                     SkippedChecks);
3003     }
3004   };
3005 
3006   switch (BuiltinIDIfNoAsmLabel) {
3007   default: break;
3008   case Builtin::BI__builtin___CFStringMakeConstantString:
3009   case Builtin::BI__builtin___NSStringMakeConstantString:
3010     return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
3011   case Builtin::BI__builtin_stdarg_start:
3012   case Builtin::BI__builtin_va_start:
3013   case Builtin::BI__va_start:
3014   case Builtin::BI__builtin_va_end:
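    // __va_start (the Microsoft form) is already passed a pointer value, so
    // emit it as a scalar; the GCC-style builtins take a va_list lvalue and go
    // through EmitVAListRef. The second operand tells EmitVAStartEnd whether
    // this is va_start (true) or va_end (false).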
3015     EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
3016                        ? EmitScalarExpr(E->getArg(0))
3017                        : EmitVAListRef(E->getArg(0)).getPointer(),
3018                    BuiltinID != Builtin::BI__builtin_va_end);
3019     return RValue::get(nullptr);
3020   case Builtin::BI__builtin_va_copy: {
3021     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
3022     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
3023     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy), {DstPtr, SrcPtr});
3024     return RValue::get(nullptr);
3025   }
3026   case Builtin::BIabs:
3027   case Builtin::BIlabs:
3028   case Builtin::BIllabs:
3029   case Builtin::BI__builtin_abs:
3030   case Builtin::BI__builtin_labs:
3031   case Builtin::BI__builtin_llabs: {
3032     bool SanitizeOverflow = SanOpts.has(SanitizerKind::SignedIntegerOverflow);
3033 
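    // abs(INT_MIN) overflows. With defined signed overflow (-fwrapv) emit a
    // plain wrapping abs; with undefined overflow and no active sanitizer,
    // emit abs under the usual no-overflow assumption; when overflow traps or
    // the signed-overflow sanitizer is on, emit an explicitly checked abs.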
3034     Value *Result;
3035     switch (getLangOpts().getSignedOverflowBehavior()) {
3036     case LangOptions::SOB_Defined:
3037       Result = EmitAbs(*this, EmitScalarExpr(E->getArg(0)), false);
3038       break;
3039     case LangOptions::SOB_Undefined:
3040       if (!SanitizeOverflow) {
3041         Result = EmitAbs(*this, EmitScalarExpr(E->getArg(0)), true);
3042         break;
3043       }
3044       [[fallthrough]];
3045     case LangOptions::SOB_Trapping:
3046       // TODO: Somehow handle the corner case when the address of abs is taken.
3047       Result = EmitOverflowCheckedAbs(*this, E, SanitizeOverflow);
3048       break;
3049     }
3050     return RValue::get(Result);
3051   }
3052   case Builtin::BI__builtin_complex: {
3053     Value *Real = EmitScalarExpr(E->getArg(0));
3054     Value *Imag = EmitScalarExpr(E->getArg(1));
3055     return RValue::getComplex({Real, Imag});
3056   }
3057   case Builtin::BI__builtin_conj:
3058   case Builtin::BI__builtin_conjf:
3059   case Builtin::BI__builtin_conjl:
3060   case Builtin::BIconj:
3061   case Builtin::BIconjf:
3062   case Builtin::BIconjl: {
3063     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3064     Value *Real = ComplexVal.first;
3065     Value *Imag = ComplexVal.second;
3066     Imag = Builder.CreateFNeg(Imag, "neg");
3067     return RValue::getComplex(std::make_pair(Real, Imag));
3068   }
3069   case Builtin::BI__builtin_creal:
3070   case Builtin::BI__builtin_crealf:
3071   case Builtin::BI__builtin_creall:
3072   case Builtin::BIcreal:
3073   case Builtin::BIcrealf:
3074   case Builtin::BIcreall: {
3075     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3076     return RValue::get(ComplexVal.first);
3077   }
3078 
3079   case Builtin::BI__builtin_preserve_access_index: {
3080     // Only enable the preserved access index region when debug info
3081     // is available, as debug info is needed to preserve the user-level
3082     // access pattern.
3083     if (!getDebugInfo()) {
3084       CGM.Error(E->getExprLoc(), "using builtin_preserve_access_index() without -g");
3085       return RValue::get(EmitScalarExpr(E->getArg(0)));
3086     }
3087 
3088     // Nested builtin_preserve_access_index() not supported
3089     if (IsInPreservedAIRegion) {
3090       CGM.Error(E->getExprLoc(), "nested builtin_preserve_access_index() not supported");
3091       return RValue::get(EmitScalarExpr(E->getArg(0)));
3092     }
3093 
3094     IsInPreservedAIRegion = true;
3095     Value *Res = EmitScalarExpr(E->getArg(0));
3096     IsInPreservedAIRegion = false;
3097     return RValue::get(Res);
3098   }
3099 
3100   case Builtin::BI__builtin_cimag:
3101   case Builtin::BI__builtin_cimagf:
3102   case Builtin::BI__builtin_cimagl:
3103   case Builtin::BIcimag:
3104   case Builtin::BIcimagf:
3105   case Builtin::BIcimagl: {
3106     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3107     return RValue::get(ComplexVal.second);
3108   }
3109 
3110   case Builtin::BI__builtin_clrsb:
3111   case Builtin::BI__builtin_clrsbl:
3112   case Builtin::BI__builtin_clrsbll: {
3113     // clrsb(x) -> clz(x < 0 ? ~x : x) - 1, i.e. the count of redundant sign bits.
3114     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3115 
3116     llvm::Type *ArgType = ArgValue->getType();
3117     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3118 
3119     llvm::Type *ResultType = ConvertType(E->getType());
3120     Value *Zero = llvm::Constant::getNullValue(ArgType);
3121     Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg");
3122     Value *Inverse = Builder.CreateNot(ArgValue, "not");
3123     Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue);
3124     Value *Ctlz = Builder.CreateCall(F, {Tmp, Builder.getFalse()});
3125     Value *Result = Builder.CreateSub(Ctlz, llvm::ConstantInt::get(ArgType, 1));
3126     Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3127                                    "cast");
3128     return RValue::get(Result);
3129   }
3130   case Builtin::BI__builtin_ctzs:
3131   case Builtin::BI__builtin_ctz:
3132   case Builtin::BI__builtin_ctzl:
3133   case Builtin::BI__builtin_ctzll: {
3134     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
3135 
3136     llvm::Type *ArgType = ArgValue->getType();
3137     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
3138 
3139     llvm::Type *ResultType = ConvertType(E->getType());
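    // The second operand of llvm.cttz is the zero-is-poison flag: if the
    // target leaves the result for a zero input undefined, pass true so the
    // backend can pick the cheapest lowering.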
3140     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
3141     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
3142     if (Result->getType() != ResultType)
3143       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3144                                      "cast");
3145     return RValue::get(Result);
3146   }
3147   case Builtin::BI__builtin_clzs:
3148   case Builtin::BI__builtin_clz:
3149   case Builtin::BI__builtin_clzl:
3150   case Builtin::BI__builtin_clzll: {
3151     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
3152 
3153     llvm::Type *ArgType = ArgValue->getType();
3154     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3155 
3156     llvm::Type *ResultType = ConvertType(E->getType());
3157     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
3158     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
3159     if (Result->getType() != ResultType)
3160       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3161                                      "cast");
3162     return RValue::get(Result);
3163   }
3164   case Builtin::BI__builtin_ffs:
3165   case Builtin::BI__builtin_ffsl:
3166   case Builtin::BI__builtin_ffsll: {
3167     // ffs(x) -> x ? cttz(x) + 1 : 0
3168     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3169 
3170     llvm::Type *ArgType = ArgValue->getType();
3171     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
3172 
3173     llvm::Type *ResultType = ConvertType(E->getType());
3174     Value *Tmp =
3175         Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
3176                           llvm::ConstantInt::get(ArgType, 1));
3177     Value *Zero = llvm::Constant::getNullValue(ArgType);
3178     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
3179     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
3180     if (Result->getType() != ResultType)
3181       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3182                                      "cast");
3183     return RValue::get(Result);
3184   }
3185   case Builtin::BI__builtin_parity:
3186   case Builtin::BI__builtin_parityl:
3187   case Builtin::BI__builtin_parityll: {
3188     // parity(x) -> ctpop(x) & 1
3189     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3190 
3191     llvm::Type *ArgType = ArgValue->getType();
3192     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
3193 
3194     llvm::Type *ResultType = ConvertType(E->getType());
3195     Value *Tmp = Builder.CreateCall(F, ArgValue);
3196     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
3197     if (Result->getType() != ResultType)
3198       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3199                                      "cast");
3200     return RValue::get(Result);
3201   }
3202   case Builtin::BI__lzcnt16:
3203   case Builtin::BI__lzcnt:
3204   case Builtin::BI__lzcnt64: {
3205     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3206 
3207     llvm::Type *ArgType = ArgValue->getType();
3208     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3209 
3210     llvm::Type *ResultType = ConvertType(E->getType());
3211     Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getFalse()});
3212     if (Result->getType() != ResultType)
3213       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3214                                      "cast");
3215     return RValue::get(Result);
3216   }
3217   case Builtin::BI__popcnt16:
3218   case Builtin::BI__popcnt:
3219   case Builtin::BI__popcnt64:
3220   case Builtin::BI__builtin_popcount:
3221   case Builtin::BI__builtin_popcountl:
3222   case Builtin::BI__builtin_popcountll: {
3223     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3224 
3225     llvm::Type *ArgType = ArgValue->getType();
3226     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
3227 
3228     llvm::Type *ResultType = ConvertType(E->getType());
3229     Value *Result = Builder.CreateCall(F, ArgValue);
3230     if (Result->getType() != ResultType)
3231       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3232                                      "cast");
3233     return RValue::get(Result);
3234   }
3235   case Builtin::BI__builtin_unpredictable: {
3236     // Always return the argument of __builtin_unpredictable. LLVM does not
3237     // handle this builtin. Metadata for this builtin should be added directly
3238     // to instructions such as branches or switches that use it.
3239     return RValue::get(EmitScalarExpr(E->getArg(0)));
3240   }
3241   case Builtin::BI__builtin_expect: {
3242     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3243     llvm::Type *ArgType = ArgValue->getType();
3244 
3245     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
3246     // Don't generate llvm.expect on -O0 as the backend won't use it for
3247     // anything.
3248     // Note, we still IRGen ExpectedValue because it could have side-effects.
3249     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
3250       return RValue::get(ArgValue);
3251 
3252     Function *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
3253     Value *Result =
3254         Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
3255     return RValue::get(Result);
3256   }
3257   case Builtin::BI__builtin_expect_with_probability: {
3258     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3259     llvm::Type *ArgType = ArgValue->getType();
3260 
3261     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
3262     llvm::APFloat Probability(0.0);
3263     const Expr *ProbArg = E->getArg(2);
3264     bool EvalSucceed = ProbArg->EvaluateAsFloat(Probability, CGM.getContext());
3265     assert(EvalSucceed && "probability should be able to evaluate as float");
3266     (void)EvalSucceed;
3267     bool LoseInfo = false;
3268     Probability.convert(llvm::APFloat::IEEEdouble(),
3269                         llvm::RoundingMode::Dynamic, &LoseInfo);
3270     llvm::Type *Ty = ConvertType(ProbArg->getType());
3271     Constant *Confidence = ConstantFP::get(Ty, Probability);
3272     // Don't generate llvm.expect.with.probability on -O0 as the backend
3273     // won't use it for anything.
3274     // Note, we still IRGen ExpectedValue because it could have side-effects.
3275     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
3276       return RValue::get(ArgValue);
3277 
3278     Function *FnExpect =
3279         CGM.getIntrinsic(Intrinsic::expect_with_probability, ArgType);
3280     Value *Result = Builder.CreateCall(
3281         FnExpect, {ArgValue, ExpectedValue, Confidence}, "expval");
3282     return RValue::get(Result);
3283   }
3284   case Builtin::BI__builtin_assume_aligned: {
3285     const Expr *Ptr = E->getArg(0);
3286     Value *PtrValue = EmitScalarExpr(Ptr);
3287     Value *OffsetValue =
3288       (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
3289 
3290     Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
3291     ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
3292     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
3293       AlignmentCI = ConstantInt::get(AlignmentCI->getIntegerType(),
3294                                      llvm::Value::MaximumAlignment);
3295 
3296     emitAlignmentAssumption(PtrValue, Ptr,
3297                             /*The expr loc is sufficient.*/ SourceLocation(),
3298                             AlignmentCI, OffsetValue);
3299     return RValue::get(PtrValue);
3300   }
3301   case Builtin::BI__assume:
3302   case Builtin::BI__builtin_assume: {
3303     if (E->getArg(0)->HasSideEffects(getContext()))
3304       return RValue::get(nullptr);
3305 
3306     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3307     Function *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
3308     Builder.CreateCall(FnAssume, ArgValue);
3309     return RValue::get(nullptr);
3310   }
3311   case Builtin::BI__builtin_assume_separate_storage: {
3312     const Expr *Arg0 = E->getArg(0);
3313     const Expr *Arg1 = E->getArg(1);
3314 
3315     Value *Value0 = EmitScalarExpr(Arg0);
3316     Value *Value1 = EmitScalarExpr(Arg1);
3317 
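    // The llvm.assume condition is a plain 'true'; the aliasing information is
    // carried entirely by the "separate_storage" operand bundle.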
3318     Value *Values[] = {Value0, Value1};
3319     OperandBundleDefT<Value *> OBD("separate_storage", Values);
3320     Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD});
3321     return RValue::get(nullptr);
3322   }
3323   case Builtin::BI__arithmetic_fence: {
3324     // Create the fence call if fast-math reassociation is allowed and the
3325     // target supports the builtin; otherwise just return the argument.
3326     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3327     llvm::FastMathFlags FMF = Builder.getFastMathFlags();
3328     bool isArithmeticFenceEnabled =
3329         FMF.allowReassoc() &&
3330         getContext().getTargetInfo().checkArithmeticFenceSupported();
3331     QualType ArgType = E->getArg(0)->getType();
3332     if (ArgType->isComplexType()) {
3333       if (isArithmeticFenceEnabled) {
3334         QualType ElementType = ArgType->castAs<ComplexType>()->getElementType();
3335         ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3336         Value *Real = Builder.CreateArithmeticFence(ComplexVal.first,
3337                                                     ConvertType(ElementType));
3338         Value *Imag = Builder.CreateArithmeticFence(ComplexVal.second,
3339                                                     ConvertType(ElementType));
3340         return RValue::getComplex(std::make_pair(Real, Imag));
3341       }
3342       ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3343       Value *Real = ComplexVal.first;
3344       Value *Imag = ComplexVal.second;
3345       return RValue::getComplex(std::make_pair(Real, Imag));
3346     }
3347     Value *ArgValue = EmitScalarExpr(E->getArg(0));
3348     if (isArithmeticFenceEnabled)
3349       return RValue::get(
3350           Builder.CreateArithmeticFence(ArgValue, ConvertType(ArgType)));
3351     return RValue::get(ArgValue);
3352   }
3353   case Builtin::BI__builtin_bswap16:
3354   case Builtin::BI__builtin_bswap32:
3355   case Builtin::BI__builtin_bswap64:
3356   case Builtin::BI_byteswap_ushort:
3357   case Builtin::BI_byteswap_ulong:
3358   case Builtin::BI_byteswap_uint64: {
3359     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
3360   }
3361   case Builtin::BI__builtin_bitreverse8:
3362   case Builtin::BI__builtin_bitreverse16:
3363   case Builtin::BI__builtin_bitreverse32:
3364   case Builtin::BI__builtin_bitreverse64: {
3365     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
3366   }
3367   case Builtin::BI__builtin_rotateleft8:
3368   case Builtin::BI__builtin_rotateleft16:
3369   case Builtin::BI__builtin_rotateleft32:
3370   case Builtin::BI__builtin_rotateleft64:
3371   case Builtin::BI_rotl8: // Microsoft variants of rotate left
3372   case Builtin::BI_rotl16:
3373   case Builtin::BI_rotl:
3374   case Builtin::BI_lrotl:
3375   case Builtin::BI_rotl64:
3376     return emitRotate(E, false);
3377 
3378   case Builtin::BI__builtin_rotateright8:
3379   case Builtin::BI__builtin_rotateright16:
3380   case Builtin::BI__builtin_rotateright32:
3381   case Builtin::BI__builtin_rotateright64:
3382   case Builtin::BI_rotr8: // Microsoft variants of rotate right
3383   case Builtin::BI_rotr16:
3384   case Builtin::BI_rotr:
3385   case Builtin::BI_lrotr:
3386   case Builtin::BI_rotr64:
3387     return emitRotate(E, true);
3388 
3389   case Builtin::BI__builtin_constant_p: {
3390     llvm::Type *ResultType = ConvertType(E->getType());
3391 
3392     const Expr *Arg = E->getArg(0);
3393     QualType ArgType = Arg->getType();
3394     // FIXME: The allowance for Obj-C pointers and block pointers is historical
3395     // and likely a mistake.
3396     if (!ArgType->isIntegralOrEnumerationType() && !ArgType->isFloatingType() &&
3397         !ArgType->isObjCObjectPointerType() && !ArgType->isBlockPointerType())
3398       // Per the GCC documentation, only numeric constants are recognized after
3399       // inlining.
3400       return RValue::get(ConstantInt::get(ResultType, 0));
3401 
3402     if (Arg->HasSideEffects(getContext()))
3403       // The argument is unevaluated, so be conservative if it might have
3404       // side-effects.
3405       return RValue::get(ConstantInt::get(ResultType, 0));
3406 
3407     Value *ArgValue = EmitScalarExpr(Arg);
3408     if (ArgType->isObjCObjectPointerType()) {
3409       // Convert Objective-C objects to id because we cannot distinguish between
3410       // LLVM types for Obj-C classes as they are opaque.
3411       ArgType = CGM.getContext().getObjCIdType();
3412       ArgValue = Builder.CreateBitCast(ArgValue, ConvertType(ArgType));
3413     }
3414     Function *F =
3415         CGM.getIntrinsic(Intrinsic::is_constant, ConvertType(ArgType));
3416     Value *Result = Builder.CreateCall(F, ArgValue);
3417     if (Result->getType() != ResultType)
3418       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/false);
3419     return RValue::get(Result);
3420   }
3421   case Builtin::BI__builtin_dynamic_object_size:
3422   case Builtin::BI__builtin_object_size: {
3423     unsigned Type =
3424         E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
3425     auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
3426 
3427     // We pass this builtin onto the optimizer so that it can figure out the
3428     // object size in more complex cases.
3429     bool IsDynamic = BuiltinID == Builtin::BI__builtin_dynamic_object_size;
3430     return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
3431                                              /*EmittedE=*/nullptr, IsDynamic));
3432   }
3433   case Builtin::BI__builtin_prefetch: {
3434     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
3435     // FIXME: Technically these constants should be of type 'int', yes?
3436     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
3437       llvm::ConstantInt::get(Int32Ty, 0);
3438     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
3439       llvm::ConstantInt::get(Int32Ty, 3);
3440     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
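    // llvm.prefetch takes (address, rw, locality, cache type): rw defaults to
    // 0 (read), locality to 3 (keep in all cache levels), and the last operand
    // is 1 to select the data cache rather than the instruction cache (0).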
3441     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
3442     Builder.CreateCall(F, {Address, RW, Locality, Data});
3443     return RValue::get(nullptr);
3444   }
3445   case Builtin::BI__builtin_readcyclecounter: {
3446     Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
3447     return RValue::get(Builder.CreateCall(F));
3448   }
3449   case Builtin::BI__builtin___clear_cache: {
3450     Value *Begin = EmitScalarExpr(E->getArg(0));
3451     Value *End = EmitScalarExpr(E->getArg(1));
3452     Function *F = CGM.getIntrinsic(Intrinsic::clear_cache);
3453     return RValue::get(Builder.CreateCall(F, {Begin, End}));
3454   }
3455   case Builtin::BI__builtin_trap:
3456     EmitTrapCall(Intrinsic::trap);
3457     return RValue::get(nullptr);
3458   case Builtin::BI__debugbreak:
3459     EmitTrapCall(Intrinsic::debugtrap);
3460     return RValue::get(nullptr);
3461   case Builtin::BI__builtin_unreachable: {
3462     EmitUnreachable(E->getExprLoc());
3463 
3464     // We do need to preserve an insertion point.
3465     EmitBlock(createBasicBlock("unreachable.cont"));
3466 
3467     return RValue::get(nullptr);
3468   }
3469 
3470   case Builtin::BI__builtin_powi:
3471   case Builtin::BI__builtin_powif:
3472   case Builtin::BI__builtin_powil: {
3473     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
3474     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
3475 
3476     if (Builder.getIsFPConstrained()) {
3477       // FIXME: llvm.powi has 2 mangling types,
3478       // llvm.experimental.constrained.powi has one.
3479       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3480       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_powi,
3481                                      Src0->getType());
3482       return RValue::get(Builder.CreateConstrainedFPCall(F, { Src0, Src1 }));
3483     }
3484 
3485     Function *F = CGM.getIntrinsic(Intrinsic::powi,
3486                                    { Src0->getType(), Src1->getType() });
3487     return RValue::get(Builder.CreateCall(F, { Src0, Src1 }));
3488   }
3489   case Builtin::BI__builtin_frexpl: {
3490     // Linux PPC will not be adding further PPCDoubleDouble support; work is
3491     // in progress to switch the default to IEEE long double. Emit a libcall
3492     // for frexpl instead of legalizing this type in the backend.
3493     if (&getTarget().getLongDoubleFormat() == &llvm::APFloat::PPCDoubleDouble())
3494       break;
3495     LLVM_FALLTHROUGH;
3496   }
3497   case Builtin::BI__builtin_frexp:
3498   case Builtin::BI__builtin_frexpf:
3499   case Builtin::BI__builtin_frexpf128:
3500   case Builtin::BI__builtin_frexpf16:
3501     return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp));
3502   case Builtin::BI__builtin_isgreater:
3503   case Builtin::BI__builtin_isgreaterequal:
3504   case Builtin::BI__builtin_isless:
3505   case Builtin::BI__builtin_islessequal:
3506   case Builtin::BI__builtin_islessgreater:
3507   case Builtin::BI__builtin_isunordered: {
3508     // Ordered comparisons: we know the arguments to these are matching scalar
3509     // floating point values.
3510     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3511     Value *LHS = EmitScalarExpr(E->getArg(0));
3512     Value *RHS = EmitScalarExpr(E->getArg(1));
3513 
3514     switch (BuiltinID) {
3515     default: llvm_unreachable("Unknown ordered comparison");
3516     case Builtin::BI__builtin_isgreater:
3517       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
3518       break;
3519     case Builtin::BI__builtin_isgreaterequal:
3520       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
3521       break;
3522     case Builtin::BI__builtin_isless:
3523       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
3524       break;
3525     case Builtin::BI__builtin_islessequal:
3526       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
3527       break;
3528     case Builtin::BI__builtin_islessgreater:
3529       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
3530       break;
3531     case Builtin::BI__builtin_isunordered:
3532       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
3533       break;
3534     }
3535     // ZExt bool to int type.
3536     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
3537   }
3538 
3539   case Builtin::BI__builtin_isnan: {
3540     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3541     Value *V = EmitScalarExpr(E->getArg(0));
3542     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3543       return RValue::get(Result);
3544     return RValue::get(
3545         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNan),
3546                            ConvertType(E->getType())));
3547   }
3548 
3549   case Builtin::BI__builtin_issignaling: {
3550     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3551     Value *V = EmitScalarExpr(E->getArg(0));
3552     return RValue::get(
3553         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcSNan),
3554                            ConvertType(E->getType())));
3555   }
3556 
3557   case Builtin::BI__builtin_isinf: {
3558     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3559     Value *V = EmitScalarExpr(E->getArg(0));
3560     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3561       return RValue::get(Result);
3562     return RValue::get(
3563         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcInf),
3564                            ConvertType(E->getType())));
3565   }
3566 
3567   case Builtin::BIfinite:
3568   case Builtin::BI__finite:
3569   case Builtin::BIfinitef:
3570   case Builtin::BI__finitef:
3571   case Builtin::BIfinitel:
3572   case Builtin::BI__finitel:
3573   case Builtin::BI__builtin_isfinite: {
3574     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3575     Value *V = EmitScalarExpr(E->getArg(0));
3576     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3577       return RValue::get(Result);
3578     return RValue::get(
3579         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcFinite),
3580                            ConvertType(E->getType())));
3581   }
3582 
3583   case Builtin::BI__builtin_isnormal: {
3584     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3585     Value *V = EmitScalarExpr(E->getArg(0));
3586     return RValue::get(
3587         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNormal),
3588                            ConvertType(E->getType())));
3589   }
3590 
3591   case Builtin::BI__builtin_issubnormal: {
3592     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3593     Value *V = EmitScalarExpr(E->getArg(0));
3594     return RValue::get(
3595         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcSubnormal),
3596                            ConvertType(E->getType())));
3597   }
3598 
3599   case Builtin::BI__builtin_iszero: {
3600     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3601     Value *V = EmitScalarExpr(E->getArg(0));
3602     return RValue::get(
3603         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcZero),
3604                            ConvertType(E->getType())));
3605   }
3606 
3607   case Builtin::BI__builtin_isfpclass: {
3608     Expr::EvalResult Result;
3609     if (!E->getArg(1)->EvaluateAsInt(Result, CGM.getContext()))
3610       break;
3611     uint64_t Test = Result.Val.getInt().getLimitedValue();
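    // 'Test' is the FPClassTest bit mask from the constant second argument,
    // e.g. (fcNan | fcInf) to test whether the value is not finite.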
3612     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3613     Value *V = EmitScalarExpr(E->getArg(0));
3614     return RValue::get(Builder.CreateZExt(Builder.createIsFPClass(V, Test),
3615                                           ConvertType(E->getType())));
3616   }
3617 
3618   case Builtin::BI__builtin_nondeterministic_value: {
3619     llvm::Type *Ty = ConvertType(E->getArg(0)->getType());
3620 
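    // freeze(poison) yields an arbitrary but fixed value of the requested
    // type, which is exactly the semantics this builtin promises.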
3621     Value *Result = PoisonValue::get(Ty);
3622     Result = Builder.CreateFreeze(Result);
3623 
3624     return RValue::get(Result);
3625   }
3626 
3627   case Builtin::BI__builtin_elementwise_abs: {
3628     Value *Result;
3629     QualType QT = E->getArg(0)->getType();
3630 
3631     if (auto *VecTy = QT->getAs<VectorType>())
3632       QT = VecTy->getElementType();
3633     if (QT->isIntegerType())
3634       Result = Builder.CreateBinaryIntrinsic(
3635           llvm::Intrinsic::abs, EmitScalarExpr(E->getArg(0)),
3636           Builder.getFalse(), nullptr, "elt.abs");
3637     else
3638       Result = emitUnaryBuiltin(*this, E, llvm::Intrinsic::fabs, "elt.abs");
3639 
3640     return RValue::get(Result);
3641   }
3642 
3643   case Builtin::BI__builtin_elementwise_ceil:
3644     return RValue::get(
3645         emitUnaryBuiltin(*this, E, llvm::Intrinsic::ceil, "elt.ceil"));
3646   case Builtin::BI__builtin_elementwise_exp:
3647     return RValue::get(
3648         emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp, "elt.exp"));
3649   case Builtin::BI__builtin_elementwise_exp2:
3650     return RValue::get(
3651         emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp2, "elt.exp2"));
3652   case Builtin::BI__builtin_elementwise_log:
3653     return RValue::get(
3654         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log, "elt.log"));
3655   case Builtin::BI__builtin_elementwise_log2:
3656     return RValue::get(
3657         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log2, "elt.log2"));
3658   case Builtin::BI__builtin_elementwise_log10:
3659     return RValue::get(
3660         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log10, "elt.log10"));
3661   case Builtin::BI__builtin_elementwise_pow: {
3662     return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::pow));
3663   }
3664   case Builtin::BI__builtin_elementwise_bitreverse:
3665     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::bitreverse,
3666                                         "elt.bitreverse"));
3667   case Builtin::BI__builtin_elementwise_cos:
3668     return RValue::get(
3669         emitUnaryBuiltin(*this, E, llvm::Intrinsic::cos, "elt.cos"));
3670   case Builtin::BI__builtin_elementwise_floor:
3671     return RValue::get(
3672         emitUnaryBuiltin(*this, E, llvm::Intrinsic::floor, "elt.floor"));
3673   case Builtin::BI__builtin_elementwise_roundeven:
3674     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::roundeven,
3675                                         "elt.roundeven"));
3676   case Builtin::BI__builtin_elementwise_round:
3677     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::round,
3678                                         "elt.round"));
3679   case Builtin::BI__builtin_elementwise_rint:
3680     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::rint,
3681                                         "elt.rint"));
3682   case Builtin::BI__builtin_elementwise_nearbyint:
3683     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::nearbyint,
3684                                         "elt.nearbyint"));
3685   case Builtin::BI__builtin_elementwise_sin:
3686     return RValue::get(
3687         emitUnaryBuiltin(*this, E, llvm::Intrinsic::sin, "elt.sin"));
3688 
3689   case Builtin::BI__builtin_elementwise_trunc:
3690     return RValue::get(
3691         emitUnaryBuiltin(*this, E, llvm::Intrinsic::trunc, "elt.trunc"));
3692   case Builtin::BI__builtin_elementwise_canonicalize:
3693     return RValue::get(
3694         emitUnaryBuiltin(*this, E, llvm::Intrinsic::canonicalize, "elt.canonicalize"));
3695   case Builtin::BI__builtin_elementwise_copysign:
3696     return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::copysign));
3697   case Builtin::BI__builtin_elementwise_fma:
3698     return RValue::get(emitTernaryBuiltin(*this, E, llvm::Intrinsic::fma));
3699   case Builtin::BI__builtin_elementwise_add_sat:
3700   case Builtin::BI__builtin_elementwise_sub_sat: {
3701     Value *Op0 = EmitScalarExpr(E->getArg(0));
3702     Value *Op1 = EmitScalarExpr(E->getArg(1));
3703     Value *Result;
3704     assert(Op0->getType()->isIntOrIntVectorTy() && "integer type expected");
3705     QualType Ty = E->getArg(0)->getType();
3706     if (auto *VecTy = Ty->getAs<VectorType>())
3707       Ty = VecTy->getElementType();
3708     bool IsSigned = Ty->isSignedIntegerType();
3709     unsigned Opc;
3710     if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_elementwise_add_sat)
3711       Opc = IsSigned ? llvm::Intrinsic::sadd_sat : llvm::Intrinsic::uadd_sat;
3712     else
3713       Opc = IsSigned ? llvm::Intrinsic::ssub_sat : llvm::Intrinsic::usub_sat;
3714     Result = Builder.CreateBinaryIntrinsic(Opc, Op0, Op1, nullptr, "elt.sat");
3715     return RValue::get(Result);
3716   }
3717 
3718   case Builtin::BI__builtin_elementwise_max: {
3719     Value *Op0 = EmitScalarExpr(E->getArg(0));
3720     Value *Op1 = EmitScalarExpr(E->getArg(1));
3721     Value *Result;
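    // Integer (vector) operands select smax/umax based on the element's
    // signedness; floating-point operands use llvm.maxnum, which prefers the
    // numeric operand when the other is a quiet NaN.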
3722     if (Op0->getType()->isIntOrIntVectorTy()) {
3723       QualType Ty = E->getArg(0)->getType();
3724       if (auto *VecTy = Ty->getAs<VectorType>())
3725         Ty = VecTy->getElementType();
3726       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3727                                                  ? llvm::Intrinsic::smax
3728                                                  : llvm::Intrinsic::umax,
3729                                              Op0, Op1, nullptr, "elt.max");
3730     } else
3731       Result = Builder.CreateMaxNum(Op0, Op1, "elt.max");
3732     return RValue::get(Result);
3733   }
3734   case Builtin::BI__builtin_elementwise_min: {
3735     Value *Op0 = EmitScalarExpr(E->getArg(0));
3736     Value *Op1 = EmitScalarExpr(E->getArg(1));
3737     Value *Result;
3738     if (Op0->getType()->isIntOrIntVectorTy()) {
3739       QualType Ty = E->getArg(0)->getType();
3740       if (auto *VecTy = Ty->getAs<VectorType>())
3741         Ty = VecTy->getElementType();
3742       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3743                                                  ? llvm::Intrinsic::smin
3744                                                  : llvm::Intrinsic::umin,
3745                                              Op0, Op1, nullptr, "elt.min");
3746     } else
3747       Result = Builder.CreateMinNum(Op0, Op1, "elt.min");
3748     return RValue::get(Result);
3749   }
3750 
3751   case Builtin::BI__builtin_reduce_max: {
3752     auto GetIntrinsicID = [](QualType QT) {
3753       if (auto *VecTy = QT->getAs<VectorType>())
3754         QT = VecTy->getElementType();
3755       if (QT->isSignedIntegerType())
3756         return llvm::Intrinsic::vector_reduce_smax;
3757       if (QT->isUnsignedIntegerType())
3758         return llvm::Intrinsic::vector_reduce_umax;
3759       assert(QT->isFloatingType() && "must have a float here");
3760       return llvm::Intrinsic::vector_reduce_fmax;
3761     };
    return RValue::get(emitUnaryBuiltin(
        *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.max"));
3764   }
3765 
3766   case Builtin::BI__builtin_reduce_min: {
3767     auto GetIntrinsicID = [](QualType QT) {
3768       if (auto *VecTy = QT->getAs<VectorType>())
3769         QT = VecTy->getElementType();
3770       if (QT->isSignedIntegerType())
3771         return llvm::Intrinsic::vector_reduce_smin;
3772       if (QT->isUnsignedIntegerType())
3773         return llvm::Intrinsic::vector_reduce_umin;
3774       assert(QT->isFloatingType() && "must have a float here");
3775       return llvm::Intrinsic::vector_reduce_fmin;
3776     };
3777 
3778     return RValue::get(emitUnaryBuiltin(
3779         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
3780   }
3781 
3782   case Builtin::BI__builtin_reduce_add:
3783     return RValue::get(emitUnaryBuiltin(
3784         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
3785   case Builtin::BI__builtin_reduce_mul:
3786     return RValue::get(emitUnaryBuiltin(
3787         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
3788   case Builtin::BI__builtin_reduce_xor:
3789     return RValue::get(emitUnaryBuiltin(
3790         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
3791   case Builtin::BI__builtin_reduce_or:
3792     return RValue::get(emitUnaryBuiltin(
3793         *this, E, llvm::Intrinsic::vector_reduce_or, "rdx.or"));
3794   case Builtin::BI__builtin_reduce_and:
3795     return RValue::get(emitUnaryBuiltin(
3796         *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
3797 
3798   case Builtin::BI__builtin_matrix_transpose: {
3799     auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
3800     Value *MatValue = EmitScalarExpr(E->getArg(0));
3801     MatrixBuilder MB(Builder);
3802     Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
3803                                              MatrixTy->getNumColumns());
3804     return RValue::get(Result);
3805   }
3806 
3807   case Builtin::BI__builtin_matrix_column_major_load: {
3808     MatrixBuilder MB(Builder);
    // Emit everything that isn't dependent on the first parameter type.
3810     Value *Stride = EmitScalarExpr(E->getArg(3));
3811     const auto *ResultTy = E->getType()->getAs<ConstantMatrixType>();
3812     auto *PtrTy = E->getArg(0)->getType()->getAs<PointerType>();
3813     assert(PtrTy && "arg0 must be of pointer type");
3814     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3815 
3816     Address Src = EmitPointerWithAlignment(E->getArg(0));
3817     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
3818                         E->getArg(0)->getExprLoc(), FD, 0);
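    // This emits a call to the llvm.matrix.column.major.load intrinsic; the
    // row and column counts come from the matrix type of the expression.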
3819     Value *Result = MB.CreateColumnMajorLoad(
3820         Src.getElementType(), Src.getPointer(),
3821         Align(Src.getAlignment().getQuantity()), Stride, IsVolatile,
3822         ResultTy->getNumRows(), ResultTy->getNumColumns(),
3823         "matrix");
3824     return RValue::get(Result);
3825   }
3826 
3827   case Builtin::BI__builtin_matrix_column_major_store: {
3828     MatrixBuilder MB(Builder);
3829     Value *Matrix = EmitScalarExpr(E->getArg(0));
3830     Address Dst = EmitPointerWithAlignment(E->getArg(1));
3831     Value *Stride = EmitScalarExpr(E->getArg(2));
3832 
3833     const auto *MatrixTy = E->getArg(0)->getType()->getAs<ConstantMatrixType>();
3834     auto *PtrTy = E->getArg(1)->getType()->getAs<PointerType>();
3835     assert(PtrTy && "arg1 must be of pointer type");
3836     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3837 
3838     EmitNonNullArgCheck(RValue::get(Dst.getPointer()), E->getArg(1)->getType(),
3839                         E->getArg(1)->getExprLoc(), FD, 0);
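    // Mirrors the load above: emit llvm.matrix.column.major.store with the
    // matrix value, destination pointer, stride, and dimensions.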
3840     Value *Result = MB.CreateColumnMajorStore(
3841         Matrix, Dst.getPointer(), Align(Dst.getAlignment().getQuantity()),
3842         Stride, IsVolatile, MatrixTy->getNumRows(), MatrixTy->getNumColumns());
3843     return RValue::get(Result);
3844   }
3845 
3846   case Builtin::BI__builtin_isinf_sign: {
3847     // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
3848     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3849     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3850     Value *Arg = EmitScalarExpr(E->getArg(0));
3851     Value *AbsArg = EmitFAbs(*this, Arg);
3852     Value *IsInf = Builder.CreateFCmpOEQ(
3853         AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
3854     Value *IsNeg = EmitSignBit(*this, Arg);
3855 
3856     llvm::Type *IntTy = ConvertType(E->getType());
3857     Value *Zero = Constant::getNullValue(IntTy);
3858     Value *One = ConstantInt::get(IntTy, 1);
3859     Value *NegativeOne = ConstantInt::get(IntTy, -1);
3860     Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
3861     Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
3862     return RValue::get(Result);
3863   }
3864 
3865   case Builtin::BI__builtin_flt_rounds: {
3866     Function *F = CGM.getIntrinsic(Intrinsic::get_rounding);
3867 
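    // llvm.get.rounding returns the C FLT_ROUNDS encoding (0 = toward zero,
    // 1 = to nearest, 2 = upward, 3 = downward).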
3868     llvm::Type *ResultType = ConvertType(E->getType());
3869     Value *Result = Builder.CreateCall(F);
3870     if (Result->getType() != ResultType)
3871       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3872                                      "cast");
3873     return RValue::get(Result);
3874   }
3875 
3876   case Builtin::BI__builtin_set_flt_rounds: {
3877     Function *F = CGM.getIntrinsic(Intrinsic::set_rounding);
3878 
3879     Value *V = EmitScalarExpr(E->getArg(0));
3880     Builder.CreateCall(F, V);
3881     return RValue::get(nullptr);
3882   }
3883 
3884   case Builtin::BI__builtin_fpclassify: {
3885     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3886     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
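    // Overall shape of the expansion (an illustrative sketch; the arguments
    // are, in order, the FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL and
    // FP_ZERO result values followed by the operand V):
    //   V == 0                ? FP_ZERO :
    //   V != V                ? FP_NAN :
    //   fabs(V) == infinity   ? FP_INFINITE :
    //   fabs(V) >= MIN_NORMAL ? FP_NORMAL : FP_SUBNORMAL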
3887     Value *V = EmitScalarExpr(E->getArg(5));
3888     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
3889 
3890     // Create Result
3891     BasicBlock *Begin = Builder.GetInsertBlock();
3892     BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
3893     Builder.SetInsertPoint(End);
3894     PHINode *Result =
3895       Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
3896                         "fpclassify_result");
3897 
3898     // if (V==0) return FP_ZERO
3899     Builder.SetInsertPoint(Begin);
3900     Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
3901                                           "iszero");
3902     Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
3903     BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
3904     Builder.CreateCondBr(IsZero, End, NotZero);
3905     Result->addIncoming(ZeroLiteral, Begin);
3906 
3907     // if (V != V) return FP_NAN
3908     Builder.SetInsertPoint(NotZero);
3909     Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
3910     Value *NanLiteral = EmitScalarExpr(E->getArg(0));
3911     BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
3912     Builder.CreateCondBr(IsNan, End, NotNan);
3913     Result->addIncoming(NanLiteral, NotZero);
3914 
3915     // if (fabs(V) == infinity) return FP_INFINITY
3916     Builder.SetInsertPoint(NotNan);
3917     Value *VAbs = EmitFAbs(*this, V);
3918     Value *IsInf =
3919       Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
3920                             "isinf");
3921     Value *InfLiteral = EmitScalarExpr(E->getArg(1));
3922     BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
3923     Builder.CreateCondBr(IsInf, End, NotInf);
3924     Result->addIncoming(InfLiteral, NotNan);
3925 
3926     // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
3927     Builder.SetInsertPoint(NotInf);
3928     APFloat Smallest = APFloat::getSmallestNormalized(
3929         getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
3930     Value *IsNormal =
3931       Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
3932                             "isnormal");
3933     Value *NormalResult =
3934       Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
3935                            EmitScalarExpr(E->getArg(3)));
3936     Builder.CreateBr(End);
3937     Result->addIncoming(NormalResult, NotInf);
3938 
3939     // return Result
3940     Builder.SetInsertPoint(End);
3941     return RValue::get(Result);
3942   }
3943 
3944   // An alloca will always return a pointer to the alloca (stack) address
3945   // space. This address space need not be the same as the AST / Language
3946   // default (e.g. in C / C++ auto vars are in the generic address space). At
3947   // the AST level this is handled within CreateTempAlloca et al., but for the
3948   // builtin / dynamic alloca we have to handle it here. We use an explicit cast
3949   // instead of passing an AS to CreateAlloca so as to not inhibit optimisation.
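  // For example (illustrative, not exhaustive): on AMDGPU allocas live in the
  // private address space (5) while the expression's pointer type may be in
  // the generic address space (0), so the result must be cast before use.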
3950   case Builtin::BIalloca:
3951   case Builtin::BI_alloca:
3952   case Builtin::BI__builtin_alloca_uninitialized:
3953   case Builtin::BI__builtin_alloca: {
3954     Value *Size = EmitScalarExpr(E->getArg(0));
3955     const TargetInfo &TI = getContext().getTargetInfo();
3956     // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
3957     const Align SuitableAlignmentInBytes =
3958         CGM.getContext()
3959             .toCharUnitsFromBits(TI.getSuitableAlign())
3960             .getAsAlign();
3961     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3962     AI->setAlignment(SuitableAlignmentInBytes);
3963     if (BuiltinID != Builtin::BI__builtin_alloca_uninitialized)
3964       initializeAlloca(*this, AI, Size, SuitableAlignmentInBytes);
3965     LangAS AAS = getASTAllocaAddressSpace();
3966     LangAS EAS = E->getType()->getPointeeType().getAddressSpace();
3967     if (AAS != EAS) {
3968       llvm::Type *Ty = CGM.getTypes().ConvertType(E->getType());
3969       return RValue::get(getTargetHooks().performAddrSpaceCast(*this, AI, AAS,
3970                                                                EAS, Ty));
3971     }
3972     return RValue::get(AI);
3973   }
3974 
3975   case Builtin::BI__builtin_alloca_with_align_uninitialized:
3976   case Builtin::BI__builtin_alloca_with_align: {
3977     Value *Size = EmitScalarExpr(E->getArg(0));
3978     Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
3979     auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
3980     unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
3981     const Align AlignmentInBytes =
3982         CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getAsAlign();
3983     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3984     AI->setAlignment(AlignmentInBytes);
3985     if (BuiltinID != Builtin::BI__builtin_alloca_with_align_uninitialized)
3986       initializeAlloca(*this, AI, Size, AlignmentInBytes);
3987     LangAS AAS = getASTAllocaAddressSpace();
3988     LangAS EAS = E->getType()->getPointeeType().getAddressSpace();
3989     if (AAS != EAS) {
3990       llvm::Type *Ty = CGM.getTypes().ConvertType(E->getType());
3991       return RValue::get(getTargetHooks().performAddrSpaceCast(*this, AI, AAS,
3992                                                                EAS, Ty));
3993     }
3994     return RValue::get(AI);
3995   }
3996 
3997   case Builtin::BIbzero:
3998   case Builtin::BI__builtin_bzero: {
3999     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4000     Value *SizeVal = EmitScalarExpr(E->getArg(1));
4001     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4002                         E->getArg(0)->getExprLoc(), FD, 0);
4003     Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
4004     return RValue::get(nullptr);
4005   }
4006 
4007   case Builtin::BIbcopy:
4008   case Builtin::BI__builtin_bcopy: {
4009     Address Src = EmitPointerWithAlignment(E->getArg(0));
4010     Address Dest = EmitPointerWithAlignment(E->getArg(1));
4011     Value *SizeVal = EmitScalarExpr(E->getArg(2));
4012     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
4013                         E->getArg(0)->getExprLoc(), FD, 0);
4014     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(1)->getType(),
4015                         E->getArg(1)->getExprLoc(), FD, 0);
4016     Builder.CreateMemMove(Dest, Src, SizeVal, false);
4017     return RValue::get(Dest.getPointer());
4018   }
4019 
4020   case Builtin::BImemcpy:
4021   case Builtin::BI__builtin_memcpy:
4022   case Builtin::BImempcpy:
4023   case Builtin::BI__builtin_mempcpy: {
4024     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4025     Address Src = EmitPointerWithAlignment(E->getArg(1));
4026     Value *SizeVal = EmitScalarExpr(E->getArg(2));
4027     EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4028     EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4029     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
4030     if (BuiltinID == Builtin::BImempcpy ||
4031         BuiltinID == Builtin::BI__builtin_mempcpy)
4032       return RValue::get(Builder.CreateInBoundsGEP(Dest.getElementType(),
4033                                                    Dest.getPointer(), SizeVal));
4034     else
4035       return RValue::get(Dest.getPointer());
4036   }
4037 
4038   case Builtin::BI__builtin_memcpy_inline: {
4039     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4040     Address Src = EmitPointerWithAlignment(E->getArg(1));
4041     uint64_t Size =
4042         E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
4043     EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4044     EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4045     Builder.CreateMemCpyInline(Dest, Src, Size);
4046     return RValue::get(nullptr);
4047   }
4048 
4049   case Builtin::BI__builtin_char_memchr:
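    // Lower __builtin_char_memchr exactly like __builtin_memchr; the generic
    // library-call path after this switch emits the actual call.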
4050     BuiltinID = Builtin::BI__builtin_memchr;
4051     break;
4052 
4053   case Builtin::BI__builtin___memcpy_chk: {
4054     // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
4055     Expr::EvalResult SizeResult, DstSizeResult;
4056     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4057         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4058       break;
4059     llvm::APSInt Size = SizeResult.Val.getInt();
4060     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4061     if (Size.ugt(DstSize))
4062       break;
4063     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4064     Address Src = EmitPointerWithAlignment(E->getArg(1));
4065     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4066     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
4067     return RValue::get(Dest.getPointer());
4068   }
4069 
4070   case Builtin::BI__builtin_objc_memmove_collectable: {
4071     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
4072     Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
4073     Value *SizeVal = EmitScalarExpr(E->getArg(2));
4074     CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
4075                                                   DestAddr, SrcAddr, SizeVal);
4076     return RValue::get(DestAddr.getPointer());
4077   }
4078 
4079   case Builtin::BI__builtin___memmove_chk: {
4080     // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
4081     Expr::EvalResult SizeResult, DstSizeResult;
4082     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4083         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4084       break;
4085     llvm::APSInt Size = SizeResult.Val.getInt();
4086     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4087     if (Size.ugt(DstSize))
4088       break;
4089     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4090     Address Src = EmitPointerWithAlignment(E->getArg(1));
4091     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4092     Builder.CreateMemMove(Dest, Src, SizeVal, false);
4093     return RValue::get(Dest.getPointer());
4094   }
4095 
4096   case Builtin::BImemmove:
4097   case Builtin::BI__builtin_memmove: {
4098     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4099     Address Src = EmitPointerWithAlignment(E->getArg(1));
4100     Value *SizeVal = EmitScalarExpr(E->getArg(2));
4101     EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4102     EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4103     Builder.CreateMemMove(Dest, Src, SizeVal, false);
4104     return RValue::get(Dest.getPointer());
4105   }
4106   case Builtin::BImemset:
4107   case Builtin::BI__builtin_memset: {
4108     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4109     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
4110                                          Builder.getInt8Ty());
4111     Value *SizeVal = EmitScalarExpr(E->getArg(2));
4112     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4113                         E->getArg(0)->getExprLoc(), FD, 0);
4114     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
4115     return RValue::get(Dest.getPointer());
4116   }
4117   case Builtin::BI__builtin_memset_inline: {
4118     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4119     Value *ByteVal =
4120         Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
4121     uint64_t Size =
4122         E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
4123     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4124                         E->getArg(0)->getExprLoc(), FD, 0);
4125     Builder.CreateMemSetInline(Dest, ByteVal, Size);
4126     return RValue::get(nullptr);
4127   }
4128   case Builtin::BI__builtin___memset_chk: {
4129     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
4130     Expr::EvalResult SizeResult, DstSizeResult;
4131     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4132         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4133       break;
4134     llvm::APSInt Size = SizeResult.Val.getInt();
4135     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4136     if (Size.ugt(DstSize))
4137       break;
4138     Address Dest = EmitPointerWithAlignment(E->getArg(0));
4139     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
4140                                          Builder.getInt8Ty());
4141     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4142     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
4143     return RValue::get(Dest.getPointer());
4144   }
4145   case Builtin::BI__builtin_wmemchr: {
4146     // The MSVC runtime library does not provide a definition of wmemchr, so we
4147     // need an inline implementation.
4148     if (!getTarget().getTriple().isOSMSVCRT())
4149       break;
4150 
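    // A rough C sketch of the loop emitted below:
    //   for (; size != 0; --size, ++str)
    //     if (*str == chr)
    //       return (wchar_t *)str;
    //   return nullptr;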
4151     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
4152     Value *Str = EmitScalarExpr(E->getArg(0));
4153     Value *Chr = EmitScalarExpr(E->getArg(1));
4154     Value *Size = EmitScalarExpr(E->getArg(2));
4155 
4156     BasicBlock *Entry = Builder.GetInsertBlock();
4157     BasicBlock *CmpEq = createBasicBlock("wmemchr.eq");
4158     BasicBlock *Next = createBasicBlock("wmemchr.next");
4159     BasicBlock *Exit = createBasicBlock("wmemchr.exit");
4160     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
4161     Builder.CreateCondBr(SizeEq0, Exit, CmpEq);
4162 
4163     EmitBlock(CmpEq);
4164     PHINode *StrPhi = Builder.CreatePHI(Str->getType(), 2);
4165     StrPhi->addIncoming(Str, Entry);
4166     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
4167     SizePhi->addIncoming(Size, Entry);
4168     CharUnits WCharAlign =
4169         getContext().getTypeAlignInChars(getContext().WCharTy);
4170     Value *StrCh = Builder.CreateAlignedLoad(WCharTy, StrPhi, WCharAlign);
4171     Value *FoundChr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 0);
4172     Value *StrEqChr = Builder.CreateICmpEQ(StrCh, Chr);
4173     Builder.CreateCondBr(StrEqChr, Exit, Next);
4174 
4175     EmitBlock(Next);
4176     Value *NextStr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 1);
4177     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
4178     Value *NextSizeEq0 =
4179         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
4180     Builder.CreateCondBr(NextSizeEq0, Exit, CmpEq);
4181     StrPhi->addIncoming(NextStr, Next);
4182     SizePhi->addIncoming(NextSize, Next);
4183 
4184     EmitBlock(Exit);
4185     PHINode *Ret = Builder.CreatePHI(Str->getType(), 3);
4186     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Entry);
4187     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Next);
4188     Ret->addIncoming(FoundChr, CmpEq);
4189     return RValue::get(Ret);
4190   }
4191   case Builtin::BI__builtin_wmemcmp: {
4192     // The MSVC runtime library does not provide a definition of wmemcmp, so we
4193     // need an inline implementation.
4194     if (!getTarget().getTriple().isOSMSVCRT())
4195       break;
4196 
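    // A rough C sketch of the loop emitted below (elements compare as
    // unsigned values, matching MSVC's 16-bit wchar_t):
    //   for (; size != 0; --size, ++dst, ++src) {
    //     if (*dst > *src) return 1;
    //     if (*dst < *src) return -1;
    //   }
    //   return 0;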
4197     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
4198 
4199     Value *Dst = EmitScalarExpr(E->getArg(0));
4200     Value *Src = EmitScalarExpr(E->getArg(1));
4201     Value *Size = EmitScalarExpr(E->getArg(2));
4202 
4203     BasicBlock *Entry = Builder.GetInsertBlock();
4204     BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
4205     BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
4206     BasicBlock *Next = createBasicBlock("wmemcmp.next");
4207     BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
4208     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
4209     Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
4210 
4211     EmitBlock(CmpGT);
4212     PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
4213     DstPhi->addIncoming(Dst, Entry);
4214     PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
4215     SrcPhi->addIncoming(Src, Entry);
4216     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
4217     SizePhi->addIncoming(Size, Entry);
4218     CharUnits WCharAlign =
4219         getContext().getTypeAlignInChars(getContext().WCharTy);
4220     Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
4221     Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
4222     Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
4223     Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
4224 
4225     EmitBlock(CmpLT);
4226     Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
4227     Builder.CreateCondBr(DstLtSrc, Exit, Next);
4228 
4229     EmitBlock(Next);
4230     Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
4231     Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
4232     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
4233     Value *NextSizeEq0 =
4234         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
4235     Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
4236     DstPhi->addIncoming(NextDst, Next);
4237     SrcPhi->addIncoming(NextSrc, Next);
4238     SizePhi->addIncoming(NextSize, Next);
4239 
4240     EmitBlock(Exit);
4241     PHINode *Ret = Builder.CreatePHI(IntTy, 4);
4242     Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
4243     Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
4244     Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
4245     Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
4246     return RValue::get(Ret);
4247   }
4248   case Builtin::BI__builtin_dwarf_cfa: {
4249     // The offset in bytes from the first argument to the CFA.
4250     //
4251     // Why on earth is this in the frontend?  Is there any reason at
4252     // all that the backend can't reasonably determine this while
4253     // lowering llvm.eh.dwarf.cfa()?
4254     //
4255     // TODO: If there's a satisfactory reason, add a target hook for
4256     // this instead of hard-coding 0, which is correct for most targets.
4257     int32_t Offset = 0;
4258 
4259     Function *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
4260     return RValue::get(Builder.CreateCall(F,
4261                                       llvm::ConstantInt::get(Int32Ty, Offset)));
4262   }
4263   case Builtin::BI__builtin_return_address: {
4264     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
4265                                                    getContext().UnsignedIntTy);
4266     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
4267     return RValue::get(Builder.CreateCall(F, Depth));
4268   }
4269   case Builtin::BI_ReturnAddress: {
4270     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
4271     return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
4272   }
4273   case Builtin::BI__builtin_frame_address: {
4274     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
4275                                                    getContext().UnsignedIntTy);
4276     Function *F = CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy);
4277     return RValue::get(Builder.CreateCall(F, Depth));
4278   }
4279   case Builtin::BI__builtin_extract_return_addr: {
4280     Value *Address = EmitScalarExpr(E->getArg(0));
4281     Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
4282     return RValue::get(Result);
4283   }
4284   case Builtin::BI__builtin_frob_return_addr: {
4285     Value *Address = EmitScalarExpr(E->getArg(0));
4286     Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
4287     return RValue::get(Result);
4288   }
4289   case Builtin::BI__builtin_dwarf_sp_column: {
4290     llvm::IntegerType *Ty
4291       = cast<llvm::IntegerType>(ConvertType(E->getType()));
4292     int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
4293     if (Column == -1) {
4294       CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
4295       return RValue::get(llvm::UndefValue::get(Ty));
4296     }
4297     return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
4298   }
4299   case Builtin::BI__builtin_init_dwarf_reg_size_table: {
4300     Value *Address = EmitScalarExpr(E->getArg(0));
4301     if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
4302       CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
4303     return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
4304   }
4305   case Builtin::BI__builtin_eh_return: {
4306     Value *Int = EmitScalarExpr(E->getArg(0));
4307     Value *Ptr = EmitScalarExpr(E->getArg(1));
4308 
4309     llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
4310     assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
4311            "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
4312     Function *F =
4313         CGM.getIntrinsic(IntTy->getBitWidth() == 32 ? Intrinsic::eh_return_i32
4314                                                     : Intrinsic::eh_return_i64);
4315     Builder.CreateCall(F, {Int, Ptr});
4316     Builder.CreateUnreachable();
4317 
4318     // We do need to preserve an insertion point.
4319     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
4320 
4321     return RValue::get(nullptr);
4322   }
4323   case Builtin::BI__builtin_unwind_init: {
4324     Function *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
4325     Builder.CreateCall(F);
4326     return RValue::get(nullptr);
4327   }
4328   case Builtin::BI__builtin_extend_pointer: {
4329     // Extends a pointer to the size of an _Unwind_Word, which is
4330     // uint64_t on all platforms.  Generally this gets poked into a
4331     // register and eventually used as an address, so if the
4332     // addressing registers are wider than pointers and the platform
4333     // doesn't implicitly ignore high-order bits when doing
4334     // addressing, we need to make sure we zext / sext based on
4335     // the platform's expectations.
4336     //
4337     // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
4338 
4339     // Cast the pointer to intptr_t.
4340     Value *Ptr = EmitScalarExpr(E->getArg(0));
4341     Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
4342 
4343     // If that's 64 bits, we're done.
4344     if (IntPtrTy->getBitWidth() == 64)
4345       return RValue::get(Result);
4346 
    // Otherwise, ask the target hooks what to do.
4348     if (getTargetHooks().extendPointerWithSExt())
4349       return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
4350     else
4351       return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
4352   }
4353   case Builtin::BI__builtin_setjmp: {
4354     // Buffer is a void**.
4355     Address Buf = EmitPointerWithAlignment(E->getArg(0));
4356 
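    // The buffer is an array of void* in the GCC __builtin_setjmp convention;
    // slot 0 receives the frame pointer and slot 2 the stack pointer (stored
    // below), with the remaining slots left to the llvm.eh.sjlj.setjmp
    // lowering.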
4357     // Store the frame pointer to the setjmp buffer.
4358     Value *FrameAddr = Builder.CreateCall(
4359         CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy),
4360         ConstantInt::get(Int32Ty, 0));
4361     Builder.CreateStore(FrameAddr, Buf);
4362 
4363     // Store the stack pointer to the setjmp buffer.
4364     Value *StackAddr = Builder.CreateStackSave();
4365     assert(Buf.getPointer()->getType() == StackAddr->getType());
4366 
4367     Address StackSaveSlot = Builder.CreateConstInBoundsGEP(Buf, 2);
4368     Builder.CreateStore(StackAddr, StackSaveSlot);
4369 
4370     // Call LLVM's EH setjmp, which is lightweight.
4371     Function *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
4372     return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
4373   }
4374   case Builtin::BI__builtin_longjmp: {
4375     Value *Buf = EmitScalarExpr(E->getArg(0));
4376 
4377     // Call LLVM's EH longjmp, which is lightweight.
4378     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
4379 
4380     // longjmp doesn't return; mark this as unreachable.
4381     Builder.CreateUnreachable();
4382 
4383     // We do need to preserve an insertion point.
4384     EmitBlock(createBasicBlock("longjmp.cont"));
4385 
4386     return RValue::get(nullptr);
4387   }
4388   case Builtin::BI__builtin_launder: {
4389     const Expr *Arg = E->getArg(0);
4390     QualType ArgTy = Arg->getType()->getPointeeType();
4391     Value *Ptr = EmitScalarExpr(Arg);
4392     if (TypeRequiresBuiltinLaunder(CGM, ArgTy))
4393       Ptr = Builder.CreateLaunderInvariantGroup(Ptr);
4394 
4395     return RValue::get(Ptr);
4396   }
4397   case Builtin::BI__sync_fetch_and_add:
4398   case Builtin::BI__sync_fetch_and_sub:
4399   case Builtin::BI__sync_fetch_and_or:
4400   case Builtin::BI__sync_fetch_and_and:
4401   case Builtin::BI__sync_fetch_and_xor:
4402   case Builtin::BI__sync_fetch_and_nand:
4403   case Builtin::BI__sync_add_and_fetch:
4404   case Builtin::BI__sync_sub_and_fetch:
4405   case Builtin::BI__sync_and_and_fetch:
4406   case Builtin::BI__sync_or_and_fetch:
4407   case Builtin::BI__sync_xor_and_fetch:
4408   case Builtin::BI__sync_nand_and_fetch:
4409   case Builtin::BI__sync_val_compare_and_swap:
4410   case Builtin::BI__sync_bool_compare_and_swap:
4411   case Builtin::BI__sync_lock_test_and_set:
4412   case Builtin::BI__sync_lock_release:
4413   case Builtin::BI__sync_swap:
4414     llvm_unreachable("Shouldn't make it through sema");
4415   case Builtin::BI__sync_fetch_and_add_1:
4416   case Builtin::BI__sync_fetch_and_add_2:
4417   case Builtin::BI__sync_fetch_and_add_4:
4418   case Builtin::BI__sync_fetch_and_add_8:
4419   case Builtin::BI__sync_fetch_and_add_16:
4420     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
4421   case Builtin::BI__sync_fetch_and_sub_1:
4422   case Builtin::BI__sync_fetch_and_sub_2:
4423   case Builtin::BI__sync_fetch_and_sub_4:
4424   case Builtin::BI__sync_fetch_and_sub_8:
4425   case Builtin::BI__sync_fetch_and_sub_16:
4426     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
4427   case Builtin::BI__sync_fetch_and_or_1:
4428   case Builtin::BI__sync_fetch_and_or_2:
4429   case Builtin::BI__sync_fetch_and_or_4:
4430   case Builtin::BI__sync_fetch_and_or_8:
4431   case Builtin::BI__sync_fetch_and_or_16:
4432     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
4433   case Builtin::BI__sync_fetch_and_and_1:
4434   case Builtin::BI__sync_fetch_and_and_2:
4435   case Builtin::BI__sync_fetch_and_and_4:
4436   case Builtin::BI__sync_fetch_and_and_8:
4437   case Builtin::BI__sync_fetch_and_and_16:
4438     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
4439   case Builtin::BI__sync_fetch_and_xor_1:
4440   case Builtin::BI__sync_fetch_and_xor_2:
4441   case Builtin::BI__sync_fetch_and_xor_4:
4442   case Builtin::BI__sync_fetch_and_xor_8:
4443   case Builtin::BI__sync_fetch_and_xor_16:
4444     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
4445   case Builtin::BI__sync_fetch_and_nand_1:
4446   case Builtin::BI__sync_fetch_and_nand_2:
4447   case Builtin::BI__sync_fetch_and_nand_4:
4448   case Builtin::BI__sync_fetch_and_nand_8:
4449   case Builtin::BI__sync_fetch_and_nand_16:
4450     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);
4451 
4452   // Clang extensions: not overloaded yet.
4453   case Builtin::BI__sync_fetch_and_min:
4454     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
4455   case Builtin::BI__sync_fetch_and_max:
4456     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
4457   case Builtin::BI__sync_fetch_and_umin:
4458     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
4459   case Builtin::BI__sync_fetch_and_umax:
4460     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
4461 
4462   case Builtin::BI__sync_add_and_fetch_1:
4463   case Builtin::BI__sync_add_and_fetch_2:
4464   case Builtin::BI__sync_add_and_fetch_4:
4465   case Builtin::BI__sync_add_and_fetch_8:
4466   case Builtin::BI__sync_add_and_fetch_16:
4467     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
4468                                 llvm::Instruction::Add);
4469   case Builtin::BI__sync_sub_and_fetch_1:
4470   case Builtin::BI__sync_sub_and_fetch_2:
4471   case Builtin::BI__sync_sub_and_fetch_4:
4472   case Builtin::BI__sync_sub_and_fetch_8:
4473   case Builtin::BI__sync_sub_and_fetch_16:
4474     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
4475                                 llvm::Instruction::Sub);
4476   case Builtin::BI__sync_and_and_fetch_1:
4477   case Builtin::BI__sync_and_and_fetch_2:
4478   case Builtin::BI__sync_and_and_fetch_4:
4479   case Builtin::BI__sync_and_and_fetch_8:
4480   case Builtin::BI__sync_and_and_fetch_16:
4481     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
4482                                 llvm::Instruction::And);
4483   case Builtin::BI__sync_or_and_fetch_1:
4484   case Builtin::BI__sync_or_and_fetch_2:
4485   case Builtin::BI__sync_or_and_fetch_4:
4486   case Builtin::BI__sync_or_and_fetch_8:
4487   case Builtin::BI__sync_or_and_fetch_16:
4488     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
4489                                 llvm::Instruction::Or);
4490   case Builtin::BI__sync_xor_and_fetch_1:
4491   case Builtin::BI__sync_xor_and_fetch_2:
4492   case Builtin::BI__sync_xor_and_fetch_4:
4493   case Builtin::BI__sync_xor_and_fetch_8:
4494   case Builtin::BI__sync_xor_and_fetch_16:
4495     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
4496                                 llvm::Instruction::Xor);
4497   case Builtin::BI__sync_nand_and_fetch_1:
4498   case Builtin::BI__sync_nand_and_fetch_2:
4499   case Builtin::BI__sync_nand_and_fetch_4:
4500   case Builtin::BI__sync_nand_and_fetch_8:
4501   case Builtin::BI__sync_nand_and_fetch_16:
4502     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
4503                                 llvm::Instruction::And, true);
4504 
4505   case Builtin::BI__sync_val_compare_and_swap_1:
4506   case Builtin::BI__sync_val_compare_and_swap_2:
4507   case Builtin::BI__sync_val_compare_and_swap_4:
4508   case Builtin::BI__sync_val_compare_and_swap_8:
4509   case Builtin::BI__sync_val_compare_and_swap_16:
4510     return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
4511 
4512   case Builtin::BI__sync_bool_compare_and_swap_1:
4513   case Builtin::BI__sync_bool_compare_and_swap_2:
4514   case Builtin::BI__sync_bool_compare_and_swap_4:
4515   case Builtin::BI__sync_bool_compare_and_swap_8:
4516   case Builtin::BI__sync_bool_compare_and_swap_16:
4517     return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
4518 
4519   case Builtin::BI__sync_swap_1:
4520   case Builtin::BI__sync_swap_2:
4521   case Builtin::BI__sync_swap_4:
4522   case Builtin::BI__sync_swap_8:
4523   case Builtin::BI__sync_swap_16:
4524     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4525 
4526   case Builtin::BI__sync_lock_test_and_set_1:
4527   case Builtin::BI__sync_lock_test_and_set_2:
4528   case Builtin::BI__sync_lock_test_and_set_4:
4529   case Builtin::BI__sync_lock_test_and_set_8:
4530   case Builtin::BI__sync_lock_test_and_set_16:
4531     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4532 
4533   case Builtin::BI__sync_lock_release_1:
4534   case Builtin::BI__sync_lock_release_2:
4535   case Builtin::BI__sync_lock_release_4:
4536   case Builtin::BI__sync_lock_release_8:
4537   case Builtin::BI__sync_lock_release_16: {
4538     Address Ptr = CheckAtomicAlignment(*this, E);
4539     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
4540 
4541     llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
4542                                              getContext().getTypeSize(ElTy));
4543     llvm::StoreInst *Store =
4544         Builder.CreateStore(llvm::Constant::getNullValue(ITy), Ptr);
4545     Store->setAtomic(llvm::AtomicOrdering::Release);
4546     return RValue::get(nullptr);
4547   }
4548 
4549   case Builtin::BI__sync_synchronize: {
4550     // We assume this is supposed to correspond to a C++0x-style
4551     // sequentially-consistent fence (i.e. this is only usable for
4552     // synchronization, not device I/O or anything like that). This intrinsic
4553     // is really badly designed in the sense that in theory, there isn't
4554     // any way to safely use it... but in practice, it mostly works
4555     // to use it with non-atomic loads and stores to get acquire/release
4556     // semantics.
4557     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
4558     return RValue::get(nullptr);
4559   }
4560 
4561   case Builtin::BI__builtin_nontemporal_load:
4562     return RValue::get(EmitNontemporalLoad(*this, E));
4563   case Builtin::BI__builtin_nontemporal_store:
4564     return RValue::get(EmitNontemporalStore(*this, E));
4565   case Builtin::BI__c11_atomic_is_lock_free:
4566   case Builtin::BI__atomic_is_lock_free: {
4567     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
4568     // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
4569     // _Atomic(T) is always properly-aligned.
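    // For example (illustrative): __atomic_is_lock_free(sizeof(T), &obj)
    // becomes a runtime call with those two arguments, while the __c11 form
    // passes sizeof(_Atomic(T)) and a null pointer.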
4570     const char *LibCallName = "__atomic_is_lock_free";
4571     CallArgList Args;
4572     Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
4573              getContext().getSizeType());
4574     if (BuiltinID == Builtin::BI__atomic_is_lock_free)
4575       Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
4576                getContext().VoidPtrTy);
4577     else
4578       Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
4579                getContext().VoidPtrTy);
4580     const CGFunctionInfo &FuncInfo =
4581         CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
4582     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
4583     llvm::FunctionCallee Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
4584     return EmitCall(FuncInfo, CGCallee::forDirect(Func),
4585                     ReturnValueSlot(), Args);
4586   }
4587 
4588   case Builtin::BI__atomic_test_and_set: {
4589     // Look at the argument type to determine whether this is a volatile
4590     // operation. The parameter type is always volatile.
4591     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4592     bool Volatile =
4593         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4594 
4595     Address Ptr =
4596         EmitPointerWithAlignment(E->getArg(0)).withElementType(Int8Ty);
4597 
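    // The operation itself is an i8 "atomicrmw xchg" of the value 1 at the
    // requested ordering; the result reports whether the old byte was nonzero.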
4598     Value *NewVal = Builder.getInt8(1);
4599     Value *Order = EmitScalarExpr(E->getArg(1));
4600     if (isa<llvm::ConstantInt>(Order)) {
4601       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4602       AtomicRMWInst *Result = nullptr;
4603       switch (ord) {
4604       case 0:  // memory_order_relaxed
4605       default: // invalid order
4606         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4607                                          llvm::AtomicOrdering::Monotonic);
4608         break;
4609       case 1: // memory_order_consume
4610       case 2: // memory_order_acquire
4611         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4612                                          llvm::AtomicOrdering::Acquire);
4613         break;
4614       case 3: // memory_order_release
4615         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4616                                          llvm::AtomicOrdering::Release);
4617         break;
4618       case 4: // memory_order_acq_rel
4620         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4621                                          llvm::AtomicOrdering::AcquireRelease);
4622         break;
4623       case 5: // memory_order_seq_cst
4624         Result = Builder.CreateAtomicRMW(
4625             llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4626             llvm::AtomicOrdering::SequentiallyConsistent);
4627         break;
4628       }
4629       Result->setVolatile(Volatile);
4630       return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4631     }
4632 
4633     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4634 
4635     llvm::BasicBlock *BBs[5] = {
4636       createBasicBlock("monotonic", CurFn),
4637       createBasicBlock("acquire", CurFn),
4638       createBasicBlock("release", CurFn),
4639       createBasicBlock("acqrel", CurFn),
4640       createBasicBlock("seqcst", CurFn)
4641     };
4642     llvm::AtomicOrdering Orders[5] = {
4643         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
4644         llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
4645         llvm::AtomicOrdering::SequentiallyConsistent};
4646 
4647     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4648     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4649 
4650     Builder.SetInsertPoint(ContBB);
4651     PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
4652 
4653     for (unsigned i = 0; i < 5; ++i) {
4654       Builder.SetInsertPoint(BBs[i]);
4655       AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
4656                                                    Ptr, NewVal, Orders[i]);
4657       RMW->setVolatile(Volatile);
4658       Result->addIncoming(RMW, BBs[i]);
4659       Builder.CreateBr(ContBB);
4660     }
4661 
4662     SI->addCase(Builder.getInt32(0), BBs[0]);
4663     SI->addCase(Builder.getInt32(1), BBs[1]);
4664     SI->addCase(Builder.getInt32(2), BBs[1]);
4665     SI->addCase(Builder.getInt32(3), BBs[2]);
4666     SI->addCase(Builder.getInt32(4), BBs[3]);
4667     SI->addCase(Builder.getInt32(5), BBs[4]);
4668 
4669     Builder.SetInsertPoint(ContBB);
4670     return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4671   }
4672 
4673   case Builtin::BI__atomic_clear: {
4674     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4675     bool Volatile =
4676         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4677 
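    // __atomic_clear is emitted as a byte store of 0 with monotonic, release,
    // or seq_cst ordering, depending on the (possibly non-constant) order
    // argument.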
4678     Address Ptr = EmitPointerWithAlignment(E->getArg(0));
4679     Ptr = Ptr.withElementType(Int8Ty);
4680     Value *NewVal = Builder.getInt8(0);
4681     Value *Order = EmitScalarExpr(E->getArg(1));
4682     if (isa<llvm::ConstantInt>(Order)) {
4683       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4684       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4685       switch (ord) {
4686       case 0:  // memory_order_relaxed
4687       default: // invalid order
4688         Store->setOrdering(llvm::AtomicOrdering::Monotonic);
4689         break;
4690       case 3:  // memory_order_release
4691         Store->setOrdering(llvm::AtomicOrdering::Release);
4692         break;
4693       case 5:  // memory_order_seq_cst
4694         Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
4695         break;
4696       }
4697       return RValue::get(nullptr);
4698     }
4699 
4700     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4701 
4702     llvm::BasicBlock *BBs[3] = {
4703       createBasicBlock("monotonic", CurFn),
4704       createBasicBlock("release", CurFn),
4705       createBasicBlock("seqcst", CurFn)
4706     };
4707     llvm::AtomicOrdering Orders[3] = {
4708         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
4709         llvm::AtomicOrdering::SequentiallyConsistent};
4710 
4711     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4712     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4713 
4714     for (unsigned i = 0; i < 3; ++i) {
4715       Builder.SetInsertPoint(BBs[i]);
4716       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4717       Store->setOrdering(Orders[i]);
4718       Builder.CreateBr(ContBB);
4719     }
4720 
4721     SI->addCase(Builder.getInt32(0), BBs[0]);
4722     SI->addCase(Builder.getInt32(3), BBs[1]);
4723     SI->addCase(Builder.getInt32(5), BBs[2]);
4724 
4725     Builder.SetInsertPoint(ContBB);
4726     return RValue::get(nullptr);
4727   }
4728 
4729   case Builtin::BI__atomic_thread_fence:
4730   case Builtin::BI__atomic_signal_fence:
4731   case Builtin::BI__c11_atomic_thread_fence:
4732   case Builtin::BI__c11_atomic_signal_fence: {
4733     llvm::SyncScope::ID SSID;
4734     if (BuiltinID == Builtin::BI__atomic_signal_fence ||
4735         BuiltinID == Builtin::BI__c11_atomic_signal_fence)
4736       SSID = llvm::SyncScope::SingleThread;
4737     else
4738       SSID = llvm::SyncScope::System;
4739     Value *Order = EmitScalarExpr(E->getArg(0));
4740     if (isa<llvm::ConstantInt>(Order)) {
4741       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4742       switch (ord) {
4743       case 0:  // memory_order_relaxed
4744       default: // invalid order
4745         break;
4746       case 1:  // memory_order_consume
4747       case 2:  // memory_order_acquire
4748         Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4749         break;
4750       case 3:  // memory_order_release
4751         Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4752         break;
4753       case 4:  // memory_order_acq_rel
4754         Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4755         break;
4756       case 5:  // memory_order_seq_cst
4757         Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4758         break;
4759       }
4760       return RValue::get(nullptr);
4761     }
4762 
4763     llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
4764     AcquireBB = createBasicBlock("acquire", CurFn);
4765     ReleaseBB = createBasicBlock("release", CurFn);
4766     AcqRelBB = createBasicBlock("acqrel", CurFn);
4767     SeqCstBB = createBasicBlock("seqcst", CurFn);
4768     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4769 
4770     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4771     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
4772 
4773     Builder.SetInsertPoint(AcquireBB);
4774     Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4775     Builder.CreateBr(ContBB);
4776     SI->addCase(Builder.getInt32(1), AcquireBB);
4777     SI->addCase(Builder.getInt32(2), AcquireBB);
4778 
4779     Builder.SetInsertPoint(ReleaseBB);
4780     Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4781     Builder.CreateBr(ContBB);
4782     SI->addCase(Builder.getInt32(3), ReleaseBB);
4783 
4784     Builder.SetInsertPoint(AcqRelBB);
4785     Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4786     Builder.CreateBr(ContBB);
4787     SI->addCase(Builder.getInt32(4), AcqRelBB);
4788 
4789     Builder.SetInsertPoint(SeqCstBB);
4790     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4791     Builder.CreateBr(ContBB);
4792     SI->addCase(Builder.getInt32(5), SeqCstBB);
4793 
4794     Builder.SetInsertPoint(ContBB);
4795     return RValue::get(nullptr);
4796   }
4797 
4798   case Builtin::BI__builtin_signbit:
4799   case Builtin::BI__builtin_signbitf:
4800   case Builtin::BI__builtin_signbitl: {
4801     return RValue::get(
4802         Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
4803                            ConvertType(E->getType())));
4804   }
4805   case Builtin::BI__warn_memset_zero_len:
4806     return RValue::getIgnored();
4807   case Builtin::BI__annotation: {
4808     // Re-encode each wide string to UTF8 and make an MDString.
4809     SmallVector<Metadata *, 1> Strings;
4810     for (const Expr *Arg : E->arguments()) {
4811       const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
4812       assert(Str->getCharByteWidth() == 2);
4813       StringRef WideBytes = Str->getBytes();
4814       std::string StrUtf8;
4815       if (!convertUTF16ToUTF8String(
4816               ArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
4817         CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
4818         continue;
4819       }
4820       Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
4821     }
4822 
    // Build an MDTuple of MDStrings and emit the intrinsic call.
4824     llvm::Function *F =
4825         CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
4826     MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
4827     Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
4828     return RValue::getIgnored();
4829   }
4830   case Builtin::BI__builtin_annotation: {
4831     llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
4832     llvm::Function *F =
4833         CGM.getIntrinsic(llvm::Intrinsic::annotation,
4834                          {AnnVal->getType(), CGM.ConstGlobalsPtrTy});
4835 
    // Get the annotation string, looking through casts. Sema requires this to
    // be a non-wide string literal, possibly wrapped in casts, so the cast<>
    // is safe.
4838     const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
4839     StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
4840     return RValue::get(
4841         EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc(), nullptr));
4842   }
4843   case Builtin::BI__builtin_addcb:
4844   case Builtin::BI__builtin_addcs:
4845   case Builtin::BI__builtin_addc:
4846   case Builtin::BI__builtin_addcl:
4847   case Builtin::BI__builtin_addcll:
4848   case Builtin::BI__builtin_subcb:
4849   case Builtin::BI__builtin_subcs:
4850   case Builtin::BI__builtin_subc:
4851   case Builtin::BI__builtin_subcl:
4852   case Builtin::BI__builtin_subcll: {
4853 
4854     // We translate all of these builtins from expressions of the form:
4855     //   int x = ..., y = ..., carryin = ..., carryout, result;
4856     //   result = __builtin_addc(x, y, carryin, &carryout);
4857     //
4858     // to LLVM IR of the form:
4859     //
4860     //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
4861     //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
4862     //   %carry1 = extractvalue {i32, i1} %tmp1, 1
4863     //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
4864     //                                                       i32 %carryin)
4865     //   %result = extractvalue {i32, i1} %tmp2, 0
4866     //   %carry2 = extractvalue {i32, i1} %tmp2, 1
4867     //   %tmp3 = or i1 %carry1, %carry2
4868     //   %tmp4 = zext i1 %tmp3 to i32
4869     //   store i32 %tmp4, i32* %carryout
4870 
4871     // Scalarize our inputs.
4872     llvm::Value *X = EmitScalarExpr(E->getArg(0));
4873     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4874     llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
4875     Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));
4876 
4877     // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
4878     llvm::Intrinsic::ID IntrinsicId;
4879     switch (BuiltinID) {
4880     default: llvm_unreachable("Unknown multiprecision builtin id.");
4881     case Builtin::BI__builtin_addcb:
4882     case Builtin::BI__builtin_addcs:
4883     case Builtin::BI__builtin_addc:
4884     case Builtin::BI__builtin_addcl:
4885     case Builtin::BI__builtin_addcll:
4886       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4887       break;
4888     case Builtin::BI__builtin_subcb:
4889     case Builtin::BI__builtin_subcs:
4890     case Builtin::BI__builtin_subc:
4891     case Builtin::BI__builtin_subcl:
4892     case Builtin::BI__builtin_subcll:
4893       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4894       break;
4895     }
4896 
4897     // Construct our resulting LLVM IR expression.
4898     llvm::Value *Carry1;
4899     llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
4900                                               X, Y, Carry1);
4901     llvm::Value *Carry2;
4902     llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
4903                                               Sum1, Carryin, Carry2);
4904     llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
4905                                                X->getType());
4906     Builder.CreateStore(CarryOut, CarryOutPtr);
4907     return RValue::get(Sum2);
4908   }
4909 
4910   case Builtin::BI__builtin_add_overflow:
4911   case Builtin::BI__builtin_sub_overflow:
4912   case Builtin::BI__builtin_mul_overflow: {
4913     const clang::Expr *LeftArg = E->getArg(0);
4914     const clang::Expr *RightArg = E->getArg(1);
4915     const clang::Expr *ResultArg = E->getArg(2);
4916 
4917     clang::QualType ResultQTy =
4918         ResultArg->getType()->castAs<PointerType>()->getPointeeType();
4919 
4920     WidthAndSignedness LeftInfo =
4921         getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
4922     WidthAndSignedness RightInfo =
4923         getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
4924     WidthAndSignedness ResultInfo =
4925         getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);
4926 
4927     // Handle mixed-sign multiplication as a special case, because adding
4928     // runtime or backend support for our generic irgen would be too expensive.
4929     if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
4930       return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
4931                                           RightInfo, ResultArg, ResultQTy,
4932                                           ResultInfo);
4933 
4934     if (isSpecialUnsignedMultiplySignedResult(BuiltinID, LeftInfo, RightInfo,
4935                                               ResultInfo))
4936       return EmitCheckedUnsignedMultiplySignedResult(
4937           *this, LeftArg, LeftInfo, RightArg, RightInfo, ResultArg, ResultQTy,
4938           ResultInfo);
4939 
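    // Compute the smallest integer type that can represent every value of the
    // operand types and the result type.  For example, mixing an 'int' and a
    // 64-bit 'unsigned long long' operand with a 'short' result yields a
    // signed 65-bit encompassing type: it must be signed, yet still wide
    // enough to hold every value of the unsigned 64-bit operand.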
4940     WidthAndSignedness EncompassingInfo =
4941         EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});
4942 
4943     llvm::Type *EncompassingLLVMTy =
4944         llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);
4945 
4946     llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);
4947 
4948     llvm::Intrinsic::ID IntrinsicId;
4949     switch (BuiltinID) {
4950     default:
4951       llvm_unreachable("Unknown overflow builtin id.");
4952     case Builtin::BI__builtin_add_overflow:
4953       IntrinsicId = EncompassingInfo.Signed
4954                         ? llvm::Intrinsic::sadd_with_overflow
4955                         : llvm::Intrinsic::uadd_with_overflow;
4956       break;
4957     case Builtin::BI__builtin_sub_overflow:
4958       IntrinsicId = EncompassingInfo.Signed
4959                         ? llvm::Intrinsic::ssub_with_overflow
4960                         : llvm::Intrinsic::usub_with_overflow;
4961       break;
4962     case Builtin::BI__builtin_mul_overflow:
4963       IntrinsicId = EncompassingInfo.Signed
4964                         ? llvm::Intrinsic::smul_with_overflow
4965                         : llvm::Intrinsic::umul_with_overflow;
4966       break;
4967     }
4968 
4969     llvm::Value *Left = EmitScalarExpr(LeftArg);
4970     llvm::Value *Right = EmitScalarExpr(RightArg);
4971     Address ResultPtr = EmitPointerWithAlignment(ResultArg);
4972 
4973     // Extend each operand to the encompassing type.
4974     Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
4975     Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);
4976 
4977     // Perform the operation on the extended values.
4978     llvm::Value *Overflow, *Result;
4979     Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);
4980 
4981     if (EncompassingInfo.Width > ResultInfo.Width) {
4982       // The encompassing type is wider than the result type, so we need to
4983       // truncate it.
4984       llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);
4985 
4986       // To see if the truncation caused an overflow, we will extend
4987       // the result and then compare it to the original result.
4988       llvm::Value *ResultTruncExt = Builder.CreateIntCast(
4989           ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
4990       llvm::Value *TruncationOverflow =
4991           Builder.CreateICmpNE(Result, ResultTruncExt);
4992 
4993       Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
4994       Result = ResultTrunc;
4995     }
4996 
4997     // Finally, store the result using the pointer.
4998     bool isVolatile =
4999       ResultArg->getType()->getPointeeType().isVolatileQualified();
5000     Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);
5001 
5002     return RValue::get(Overflow);
5003   }
5004 
5005   case Builtin::BI__builtin_uadd_overflow:
5006   case Builtin::BI__builtin_uaddl_overflow:
5007   case Builtin::BI__builtin_uaddll_overflow:
5008   case Builtin::BI__builtin_usub_overflow:
5009   case Builtin::BI__builtin_usubl_overflow:
5010   case Builtin::BI__builtin_usubll_overflow:
5011   case Builtin::BI__builtin_umul_overflow:
5012   case Builtin::BI__builtin_umull_overflow:
5013   case Builtin::BI__builtin_umulll_overflow:
5014   case Builtin::BI__builtin_sadd_overflow:
5015   case Builtin::BI__builtin_saddl_overflow:
5016   case Builtin::BI__builtin_saddll_overflow:
5017   case Builtin::BI__builtin_ssub_overflow:
5018   case Builtin::BI__builtin_ssubl_overflow:
5019   case Builtin::BI__builtin_ssubll_overflow:
5020   case Builtin::BI__builtin_smul_overflow:
5021   case Builtin::BI__builtin_smull_overflow:
5022   case Builtin::BI__builtin_smulll_overflow: {
5023 
    // We translate each of these builtins directly to the corresponding LLVM
    // overflow intrinsic.
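    // For example, __builtin_uadd_overflow(x, y, &sum) becomes a call to the
    // llvm.uadd.with.overflow intrinsic: the first element of the result pair
    // is stored to *sum and the second (the carry bit) is returned.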
5025 
5026     // Scalarize our inputs.
5027     llvm::Value *X = EmitScalarExpr(E->getArg(0));
5028     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
5029     Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));
5030 
5031     // Decide which of the overflow intrinsics we are lowering to:
5032     llvm::Intrinsic::ID IntrinsicId;
5033     switch (BuiltinID) {
5034     default: llvm_unreachable("Unknown overflow builtin id.");
5035     case Builtin::BI__builtin_uadd_overflow:
5036     case Builtin::BI__builtin_uaddl_overflow:
5037     case Builtin::BI__builtin_uaddll_overflow:
5038       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
5039       break;
5040     case Builtin::BI__builtin_usub_overflow:
5041     case Builtin::BI__builtin_usubl_overflow:
5042     case Builtin::BI__builtin_usubll_overflow:
5043       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
5044       break;
5045     case Builtin::BI__builtin_umul_overflow:
5046     case Builtin::BI__builtin_umull_overflow:
5047     case Builtin::BI__builtin_umulll_overflow:
5048       IntrinsicId = llvm::Intrinsic::umul_with_overflow;
5049       break;
5050     case Builtin::BI__builtin_sadd_overflow:
5051     case Builtin::BI__builtin_saddl_overflow:
5052     case Builtin::BI__builtin_saddll_overflow:
5053       IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
5054       break;
5055     case Builtin::BI__builtin_ssub_overflow:
5056     case Builtin::BI__builtin_ssubl_overflow:
5057     case Builtin::BI__builtin_ssubll_overflow:
5058       IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
5059       break;
5060     case Builtin::BI__builtin_smul_overflow:
5061     case Builtin::BI__builtin_smull_overflow:
5062     case Builtin::BI__builtin_smulll_overflow:
5063       IntrinsicId = llvm::Intrinsic::smul_with_overflow;
5064       break;
5065     }
5066 
5067 
5068     llvm::Value *Carry;
5069     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
5070     Builder.CreateStore(Sum, SumOutPtr);
5071 
5072     return RValue::get(Carry);
5073   }
5074   case Builtin::BIaddressof:
5075   case Builtin::BI__addressof:
5076   case Builtin::BI__builtin_addressof:
5077     return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
5078   case Builtin::BI__builtin_function_start:
5079     return RValue::get(CGM.GetFunctionStart(
5080         E->getArg(0)->getAsBuiltinConstantDeclRef(CGM.getContext())));
5081   case Builtin::BI__builtin_operator_new:
5082     return EmitBuiltinNewDeleteCall(
5083         E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
5084   case Builtin::BI__builtin_operator_delete:
5085     EmitBuiltinNewDeleteCall(
5086         E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);
5087     return RValue::get(nullptr);
5088 
5089   case Builtin::BI__builtin_is_aligned:
5090     return EmitBuiltinIsAligned(E);
5091   case Builtin::BI__builtin_align_up:
5092     return EmitBuiltinAlignTo(E, true);
5093   case Builtin::BI__builtin_align_down:
5094     return EmitBuiltinAlignTo(E, false);
5095 
5096   case Builtin::BI__noop:
5097     // __noop always evaluates to an integer literal zero.
5098     return RValue::get(ConstantInt::get(IntTy, 0));
5099   case Builtin::BI__builtin_call_with_static_chain: {
5100     const CallExpr *Call = cast<CallExpr>(E->getArg(0));
5101     const Expr *Chain = E->getArg(1);
5102     return EmitCall(Call->getCallee()->getType(),
5103                     EmitCallee(Call->getCallee()), Call, ReturnValue,
5104                     EmitScalarExpr(Chain));
5105   }
5106   case Builtin::BI_InterlockedExchange8:
5107   case Builtin::BI_InterlockedExchange16:
5108   case Builtin::BI_InterlockedExchange:
5109   case Builtin::BI_InterlockedExchangePointer:
5110     return RValue::get(
5111         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
5112   case Builtin::BI_InterlockedCompareExchangePointer:
5113   case Builtin::BI_InterlockedCompareExchangePointer_nf: {
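    // Lower the pointer compare-exchange by round-tripping the exchange and
    // comparand values through a pointer-sized integer, since cmpxchg operates
    // on integers; the _nf ("no fence") variant uses monotonic ordering.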
5114     llvm::Type *RTy;
5115     llvm::IntegerType *IntType = IntegerType::get(
5116         getLLVMContext(), getContext().getTypeSize(E->getType()));
5117 
5118     Address DestAddr = CheckAtomicAlignment(*this, E);
5119 
5120     llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
5121     RTy = Exchange->getType();
5122     Exchange = Builder.CreatePtrToInt(Exchange, IntType);
5123 
5124     llvm::Value *Comparand =
5125       Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
5126 
5127     auto Ordering =
5128       BuiltinID == Builtin::BI_InterlockedCompareExchangePointer_nf ?
5129       AtomicOrdering::Monotonic : AtomicOrdering::SequentiallyConsistent;
5130 
5131     auto Result = Builder.CreateAtomicCmpXchg(DestAddr, Comparand, Exchange,
5132                                               Ordering, Ordering);
5133     Result->setVolatile(true);
5134 
5135     return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
5136                                                                          0),
5137                                               RTy));
5138   }
5139   case Builtin::BI_InterlockedCompareExchange8:
5140   case Builtin::BI_InterlockedCompareExchange16:
5141   case Builtin::BI_InterlockedCompareExchange:
5142   case Builtin::BI_InterlockedCompareExchange64:
5143     return RValue::get(EmitAtomicCmpXchgForMSIntrin(*this, E));
5144   case Builtin::BI_InterlockedIncrement16:
5145   case Builtin::BI_InterlockedIncrement:
5146     return RValue::get(
5147         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
5148   case Builtin::BI_InterlockedDecrement16:
5149   case Builtin::BI_InterlockedDecrement:
5150     return RValue::get(
5151         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
5152   case Builtin::BI_InterlockedAnd8:
5153   case Builtin::BI_InterlockedAnd16:
5154   case Builtin::BI_InterlockedAnd:
5155     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
5156   case Builtin::BI_InterlockedExchangeAdd8:
5157   case Builtin::BI_InterlockedExchangeAdd16:
5158   case Builtin::BI_InterlockedExchangeAdd:
5159     return RValue::get(
5160         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
5161   case Builtin::BI_InterlockedExchangeSub8:
5162   case Builtin::BI_InterlockedExchangeSub16:
5163   case Builtin::BI_InterlockedExchangeSub:
5164     return RValue::get(
5165         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
5166   case Builtin::BI_InterlockedOr8:
5167   case Builtin::BI_InterlockedOr16:
5168   case Builtin::BI_InterlockedOr:
5169     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
5170   case Builtin::BI_InterlockedXor8:
5171   case Builtin::BI_InterlockedXor16:
5172   case Builtin::BI_InterlockedXor:
5173     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
5174 
5175   case Builtin::BI_bittest64:
5176   case Builtin::BI_bittest:
5177   case Builtin::BI_bittestandcomplement64:
5178   case Builtin::BI_bittestandcomplement:
5179   case Builtin::BI_bittestandreset64:
5180   case Builtin::BI_bittestandreset:
5181   case Builtin::BI_bittestandset64:
5182   case Builtin::BI_bittestandset:
5183   case Builtin::BI_interlockedbittestandreset:
5184   case Builtin::BI_interlockedbittestandreset64:
5185   case Builtin::BI_interlockedbittestandset64:
5186   case Builtin::BI_interlockedbittestandset:
5187   case Builtin::BI_interlockedbittestandset_acq:
5188   case Builtin::BI_interlockedbittestandset_rel:
5189   case Builtin::BI_interlockedbittestandset_nf:
5190   case Builtin::BI_interlockedbittestandreset_acq:
5191   case Builtin::BI_interlockedbittestandreset_rel:
5192   case Builtin::BI_interlockedbittestandreset_nf:
5193     return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
5194 
5195     // These builtins exist to emit regular volatile loads and stores not
5196     // affected by the -fms-volatile setting.
5197   case Builtin::BI__iso_volatile_load8:
5198   case Builtin::BI__iso_volatile_load16:
5199   case Builtin::BI__iso_volatile_load32:
5200   case Builtin::BI__iso_volatile_load64:
5201     return RValue::get(EmitISOVolatileLoad(*this, E));
5202   case Builtin::BI__iso_volatile_store8:
5203   case Builtin::BI__iso_volatile_store16:
5204   case Builtin::BI__iso_volatile_store32:
5205   case Builtin::BI__iso_volatile_store64:
5206     return RValue::get(EmitISOVolatileStore(*this, E));
5207 
5208   case Builtin::BI__exception_code:
5209   case Builtin::BI_exception_code:
5210     return RValue::get(EmitSEHExceptionCode());
5211   case Builtin::BI__exception_info:
5212   case Builtin::BI_exception_info:
5213     return RValue::get(EmitSEHExceptionInfo());
5214   case Builtin::BI__abnormal_termination:
5215   case Builtin::BI_abnormal_termination:
5216     return RValue::get(EmitSEHAbnormalTermination());
5217   case Builtin::BI_setjmpex:
5218     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
5219         E->getArg(0)->getType()->isPointerType())
5220       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
5221     break;
5222   case Builtin::BI_setjmp:
5223     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
5224         E->getArg(0)->getType()->isPointerType()) {
5225       if (getTarget().getTriple().getArch() == llvm::Triple::x86)
5226         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
5227       else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
5228         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
5229       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
5230     }
5231     break;
5232 
5233   // C++ std:: builtins.
5234   case Builtin::BImove:
5235   case Builtin::BImove_if_noexcept:
5236   case Builtin::BIforward:
5237   case Builtin::BIforward_like:
5238   case Builtin::BIas_const:
5239     return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
5240   case Builtin::BI__GetExceptionInfo: {
5241     if (llvm::GlobalVariable *GV =
5242             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
5243       return RValue::get(GV);
5244     break;
5245   }
5246 
5247   case Builtin::BI__fastfail:
5248     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
5249 
5250   case Builtin::BI__builtin_coro_id:
5251     return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
5252   case Builtin::BI__builtin_coro_promise:
5253     return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
5254   case Builtin::BI__builtin_coro_resume:
5255     EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
5256     return RValue::get(nullptr);
5257   case Builtin::BI__builtin_coro_frame:
5258     return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
5259   case Builtin::BI__builtin_coro_noop:
5260     return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
5261   case Builtin::BI__builtin_coro_free:
5262     return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
5263   case Builtin::BI__builtin_coro_destroy:
5264     EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
5265     return RValue::get(nullptr);
5266   case Builtin::BI__builtin_coro_done:
5267     return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
5268   case Builtin::BI__builtin_coro_alloc:
5269     return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
5270   case Builtin::BI__builtin_coro_begin:
5271     return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
5272   case Builtin::BI__builtin_coro_end:
5273     return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
5274   case Builtin::BI__builtin_coro_suspend:
5275     return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
5276   case Builtin::BI__builtin_coro_size:
5277     return EmitCoroutineIntrinsic(E, Intrinsic::coro_size);
5278   case Builtin::BI__builtin_coro_align:
5279     return EmitCoroutineIntrinsic(E, Intrinsic::coro_align);
5280 
5281   // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
5282   case Builtin::BIread_pipe:
5283   case Builtin::BIwrite_pipe: {
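    // read_pipe(p, &val) lowers to __read_pipe_2(p, ptr, size, align); the
    // four-argument form taking a reserve id and an index lowers to
    // __read_pipe_4.  write_pipe is handled the same way through the
    // __write_pipe_* entry points.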
5284     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5285           *Arg1 = EmitScalarExpr(E->getArg(1));
5286     CGOpenCLRuntime OpenCLRT(CGM);
5287     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5288     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5289 
5290     // Type of the generic packet parameter.
5291     unsigned GenericAS =
5292         getContext().getTargetAddressSpace(LangAS::opencl_generic);
5293     llvm::Type *I8PTy = llvm::PointerType::get(getLLVMContext(), GenericAS);
5294 
    // Determine which overloaded version we should generate the call for.
5296     if (2U == E->getNumArgs()) {
5297       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
5298                                                              : "__write_pipe_2";
      // Create a generic function type so that the call works with any
      // builtin or user-defined type.
5301       llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
5302       llvm::FunctionType *FTy = llvm::FunctionType::get(
5303           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5304       Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
5305       return RValue::get(
5306           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5307                           {Arg0, BCast, PacketSize, PacketAlign}));
5308     } else {
5309       assert(4 == E->getNumArgs() &&
5310              "Illegal number of parameters to pipe function");
5311       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
5312                                                              : "__write_pipe_4";
5313 
5314       llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
5315                               Int32Ty, Int32Ty};
5316       Value *Arg2 = EmitScalarExpr(E->getArg(2)),
5317             *Arg3 = EmitScalarExpr(E->getArg(3));
5318       llvm::FunctionType *FTy = llvm::FunctionType::get(
5319           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5320       Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
5321       // We know the third argument is an integer type, but we may need to cast
5322       // it to i32.
5323       if (Arg2->getType() != Int32Ty)
5324         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
5325       return RValue::get(
5326           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5327                           {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
5328     }
5329   }
  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe reserve read and write
5331   // functions
5332   case Builtin::BIreserve_read_pipe:
5333   case Builtin::BIreserve_write_pipe:
5334   case Builtin::BIwork_group_reserve_read_pipe:
5335   case Builtin::BIwork_group_reserve_write_pipe:
5336   case Builtin::BIsub_group_reserve_read_pipe:
5337   case Builtin::BIsub_group_reserve_write_pipe: {
5338     // Composing the mangled name for the function.
5339     const char *Name;
5340     if (BuiltinID == Builtin::BIreserve_read_pipe)
5341       Name = "__reserve_read_pipe";
5342     else if (BuiltinID == Builtin::BIreserve_write_pipe)
5343       Name = "__reserve_write_pipe";
5344     else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
5345       Name = "__work_group_reserve_read_pipe";
5346     else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
5347       Name = "__work_group_reserve_write_pipe";
5348     else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
5349       Name = "__sub_group_reserve_read_pipe";
5350     else
5351       Name = "__sub_group_reserve_write_pipe";
5352 
5353     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5354           *Arg1 = EmitScalarExpr(E->getArg(1));
5355     llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
5356     CGOpenCLRuntime OpenCLRT(CGM);
5357     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5358     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5359 
5360     // Building the generic function prototype.
5361     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
5362     llvm::FunctionType *FTy = llvm::FunctionType::get(
5363         ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5364     // We know the second argument is an integer type, but we may need to cast
5365     // it to i32.
5366     if (Arg1->getType() != Int32Ty)
5367       Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
5368     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5369                                        {Arg0, Arg1, PacketSize, PacketAlign}));
5370   }
5371   // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
5372   // functions
5373   case Builtin::BIcommit_read_pipe:
5374   case Builtin::BIcommit_write_pipe:
5375   case Builtin::BIwork_group_commit_read_pipe:
5376   case Builtin::BIwork_group_commit_write_pipe:
5377   case Builtin::BIsub_group_commit_read_pipe:
5378   case Builtin::BIsub_group_commit_write_pipe: {
5379     const char *Name;
5380     if (BuiltinID == Builtin::BIcommit_read_pipe)
5381       Name = "__commit_read_pipe";
5382     else if (BuiltinID == Builtin::BIcommit_write_pipe)
5383       Name = "__commit_write_pipe";
5384     else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
5385       Name = "__work_group_commit_read_pipe";
5386     else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
5387       Name = "__work_group_commit_write_pipe";
5388     else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
5389       Name = "__sub_group_commit_read_pipe";
5390     else
5391       Name = "__sub_group_commit_write_pipe";
5392 
5393     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5394           *Arg1 = EmitScalarExpr(E->getArg(1));
5395     CGOpenCLRuntime OpenCLRT(CGM);
5396     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5397     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5398 
5399     // Building the generic function prototype.
5400     llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
5401     llvm::FunctionType *FTy =
5402         llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
5403                                 llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5404 
5405     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5406                                        {Arg0, Arg1, PacketSize, PacketAlign}));
5407   }
5408   // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
5409   case Builtin::BIget_pipe_num_packets:
5410   case Builtin::BIget_pipe_max_packets: {
5411     const char *BaseName;
5412     const auto *PipeTy = E->getArg(0)->getType()->castAs<PipeType>();
5413     if (BuiltinID == Builtin::BIget_pipe_num_packets)
5414       BaseName = "__get_pipe_num_packets";
5415     else
5416       BaseName = "__get_pipe_max_packets";
5417     std::string Name = std::string(BaseName) +
5418                        std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
5419 
5420     // Building the generic function prototype.
5421     Value *Arg0 = EmitScalarExpr(E->getArg(0));
5422     CGOpenCLRuntime OpenCLRT(CGM);
5423     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5424     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5425     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
5426     llvm::FunctionType *FTy = llvm::FunctionType::get(
5427         Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5428 
5429     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5430                                        {Arg0, PacketSize, PacketAlign}));
5431   }
5432 
5433   // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
5434   case Builtin::BIto_global:
5435   case Builtin::BIto_local:
5436   case Builtin::BIto_private: {
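    // to_global(p), to_local(p) and to_private(p) lower to calls to the
    // runtime functions __to_global, __to_local and __to_private, passing the
    // argument cast to the generic address space and returning a pointer in
    // the address space named by the builtin's result type.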
5437     auto Arg0 = EmitScalarExpr(E->getArg(0));
5438     auto NewArgT = llvm::PointerType::get(
5439         getLLVMContext(),
5440         CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5441     auto NewRetT = llvm::PointerType::get(
5442         getLLVMContext(),
5443         CGM.getContext().getTargetAddressSpace(
5444             E->getType()->getPointeeType().getAddressSpace()));
5445     auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
5446     llvm::Value *NewArg;
5447     if (Arg0->getType()->getPointerAddressSpace() !=
5448         NewArgT->getPointerAddressSpace())
5449       NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
5450     else
5451       NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
5452     auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
5453     auto NewCall =
5454         EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
5455     return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
5456       ConvertType(E->getType())));
5457   }
5458 
5459   // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
5460   // It contains four different overload formats specified in Table 6.13.17.1.
5461   case Builtin::BIenqueue_kernel: {
5462     StringRef Name; // Generated function call name
5463     unsigned NumArgs = E->getNumArgs();
5464 
5465     llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
5466     llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5467         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5468 
5469     llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
5470     llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
5471     LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
5472     llvm::Value *Range = NDRangeL.getAddress(*this).getPointer();
5473     llvm::Type *RangeTy = NDRangeL.getAddress(*this).getType();
5474 
5475     if (NumArgs == 4) {
5476       // The most basic form of the call with parameters:
5477       // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
5478       Name = "__enqueue_kernel_basic";
5479       llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
5480                               GenericVoidPtrTy};
5481       llvm::FunctionType *FTy = llvm::FunctionType::get(
5482           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5483 
5484       auto Info =
5485           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5486       llvm::Value *Kernel =
5487           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5488       llvm::Value *Block =
5489           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5490 
5491       AttrBuilder B(Builder.getContext());
5492       B.addByValAttr(NDRangeL.getAddress(*this).getElementType());
5493       llvm::AttributeList ByValAttrSet =
5494           llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
5495 
5496       auto RTCall =
5497           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
5498                           {Queue, Flags, Range, Kernel, Block});
5499       RTCall->setAttributes(ByValAttrSet);
5500       return RValue::get(RTCall);
5501     }
5502     assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
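
    // The remaining forms pass the sizes of the block's local-memory
    // arguments (and optionally event arguments) to the runtime.  For example,
    //   enqueue_kernel(q, flags, ndrange, ^(local int *p){ ... }, 32u)
    // lowers to __enqueue_kernel_varargs with a one-element size array {32}.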
5503 
5504     // Create a temporary array to hold the sizes of local pointer arguments
5505     // for the block. \p First is the position of the first size argument.
5506     auto CreateArrayForSizeVar = [=](unsigned First)
5507         -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
5508       llvm::APInt ArraySize(32, NumArgs - First);
5509       QualType SizeArrayTy = getContext().getConstantArrayType(
5510           getContext().getSizeType(), ArraySize, nullptr,
5511           ArraySizeModifier::Normal,
5512           /*IndexTypeQuals=*/0);
5513       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
5514       llvm::Value *TmpPtr = Tmp.getPointer();
5515       llvm::Value *TmpSize = EmitLifetimeStart(
5516           CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
5517       llvm::Value *ElemPtr;
5518       // Each of the following arguments specifies the size of the corresponding
5519       // argument passed to the enqueued block.
5520       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
5521       for (unsigned I = First; I < NumArgs; ++I) {
5522         auto *Index = llvm::ConstantInt::get(IntTy, I - First);
5523         auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
5524                                       {Zero, Index});
5525         if (I == First)
5526           ElemPtr = GEP;
5527         auto *V =
5528             Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
5529         Builder.CreateAlignedStore(
5530             V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
5531       }
5532       return std::tie(ElemPtr, TmpSize, TmpPtr);
5533     };
5534 
5535     // Could have events and/or varargs.
5536     if (E->getArg(3)->getType()->isBlockPointerType()) {
5537       // No events passed, but has variadic arguments.
5538       Name = "__enqueue_kernel_varargs";
5539       auto Info =
5540           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5541       llvm::Value *Kernel =
5542           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5543       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5544       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5545       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4);
5546 
5547       // Create a vector of the arguments, as well as a constant value to
5548       // express to the runtime the number of variadic arguments.
5549       llvm::Value *const Args[] = {Queue,  Flags,
5550                                    Range,  Kernel,
5551                                    Block,  ConstantInt::get(IntTy, NumArgs - 4),
5552                                    ElemPtr};
5553       llvm::Type *const ArgTys[] = {
5554           QueueTy,          IntTy, RangeTy,           GenericVoidPtrTy,
5555           GenericVoidPtrTy, IntTy, ElemPtr->getType()};
5556 
5557       llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
5558       auto Call = RValue::get(
5559           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
5560       if (TmpSize)
5561         EmitLifetimeEnd(TmpSize, TmpPtr);
5562       return Call;
5563     }
    // From this point on, every call form has event arguments.
5565     if (NumArgs >= 7) {
5566       llvm::PointerType *PtrTy = llvm::PointerType::get(
5567           CGM.getLLVMContext(),
5568           CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5569 
5570       llvm::Value *NumEvents =
5571           Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
5572 
      // Since SemaOpenCLBuiltinEnqueueKernel allows the fifth and sixth
      // arguments to be a null pointer constant (including a `0` literal), we
      // can detect that case and emit a null pointer directly.
5576       llvm::Value *EventWaitList = nullptr;
5577       if (E->getArg(4)->isNullPointerConstant(
5578               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5579         EventWaitList = llvm::ConstantPointerNull::get(PtrTy);
5580       } else {
5581         EventWaitList = E->getArg(4)->getType()->isArrayType()
5582                         ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
5583                         : EmitScalarExpr(E->getArg(4));
5584         // Convert to generic address space.
5585         EventWaitList = Builder.CreatePointerCast(EventWaitList, PtrTy);
5586       }
5587       llvm::Value *EventRet = nullptr;
5588       if (E->getArg(5)->isNullPointerConstant(
5589               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5590         EventRet = llvm::ConstantPointerNull::get(PtrTy);
5591       } else {
5592         EventRet =
5593             Builder.CreatePointerCast(EmitScalarExpr(E->getArg(5)), PtrTy);
5594       }
5595 
5596       auto Info =
5597           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
5598       llvm::Value *Kernel =
5599           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5600       llvm::Value *Block =
5601           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5602 
5603       std::vector<llvm::Type *> ArgTys = {
5604           QueueTy, Int32Ty, RangeTy,          Int32Ty,
5605           PtrTy,   PtrTy,   GenericVoidPtrTy, GenericVoidPtrTy};
5606 
5607       std::vector<llvm::Value *> Args = {Queue,     Flags,         Range,
5608                                          NumEvents, EventWaitList, EventRet,
5609                                          Kernel,    Block};
5610 
5611       if (NumArgs == 7) {
5612         // Has events but no variadics.
5613         Name = "__enqueue_kernel_basic_events";
5614         llvm::FunctionType *FTy = llvm::FunctionType::get(
5615             Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5616         return RValue::get(
5617             EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5618                             llvm::ArrayRef<llvm::Value *>(Args)));
5619       }
      // Has event info and variadic arguments.
      // Pass the number of variadic arguments to the runtime function too.
5622       Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
5623       ArgTys.push_back(Int32Ty);
5624       Name = "__enqueue_kernel_events_varargs";
5625 
5626       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5627       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7);
5628       Args.push_back(ElemPtr);
5629       ArgTys.push_back(ElemPtr->getType());
5630 
5631       llvm::FunctionType *FTy = llvm::FunctionType::get(
5632           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5633       auto Call =
5634           RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5635                                       llvm::ArrayRef<llvm::Value *>(Args)));
5636       if (TmpSize)
5637         EmitLifetimeEnd(TmpSize, TmpPtr);
5638       return Call;
5639     }
5640     [[fallthrough]];
5641   }
5642   // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
5643   // parameter.
5644   case Builtin::BIget_kernel_work_group_size: {
5645     llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5646         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5647     auto Info =
5648         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5649     Value *Kernel =
5650         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5651     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5652     return RValue::get(EmitRuntimeCall(
5653         CGM.CreateRuntimeFunction(
5654             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5655                                     false),
5656             "__get_kernel_work_group_size_impl"),
5657         {Kernel, Arg}));
5658   }
5659   case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
5660     llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5661         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5662     auto Info =
5663         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5664     Value *Kernel =
5665         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5666     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5667     return RValue::get(EmitRuntimeCall(
5668         CGM.CreateRuntimeFunction(
5669             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5670                                     false),
5671             "__get_kernel_preferred_work_group_size_multiple_impl"),
5672         {Kernel, Arg}));
5673   }
5674   case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
5675   case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
5676     llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5677         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5678     LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
5679     llvm::Value *NDRange = NDRangeL.getAddress(*this).getPointer();
5680     auto Info =
5681         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
5682     Value *Kernel =
5683         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5684     Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5685     const char *Name =
5686         BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
5687             ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
5688             : "__get_kernel_sub_group_count_for_ndrange_impl";
5689     return RValue::get(EmitRuntimeCall(
5690         CGM.CreateRuntimeFunction(
5691             llvm::FunctionType::get(
5692                 IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
5693                 false),
5694             Name),
5695         {NDRange, Kernel, Block}));
5696   }
5697 
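  // __builtin_store_half[f] truncates a double/float value to half precision
  // and stores it through the pointer argument; the corresponding load
  // builtins below widen the stored half back to double or float.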
5698   case Builtin::BI__builtin_store_half:
5699   case Builtin::BI__builtin_store_halff: {
5700     Value *Val = EmitScalarExpr(E->getArg(0));
5701     Address Address = EmitPointerWithAlignment(E->getArg(1));
5702     Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
5703     Builder.CreateStore(HalfVal, Address);
5704     return RValue::get(nullptr);
5705   }
5706   case Builtin::BI__builtin_load_half: {
5707     Address Address = EmitPointerWithAlignment(E->getArg(0));
5708     Value *HalfVal = Builder.CreateLoad(Address);
5709     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
5710   }
5711   case Builtin::BI__builtin_load_halff: {
5712     Address Address = EmitPointerWithAlignment(E->getArg(0));
5713     Value *HalfVal = Builder.CreateLoad(Address);
5714     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
5715   }
5716   case Builtin::BIprintf:
5717     if (getTarget().getTriple().isNVPTX() ||
5718         getTarget().getTriple().isAMDGCN()) {
5719       if (getLangOpts().OpenMPIsTargetDevice)
5720         return EmitOpenMPDevicePrintfCallExpr(E);
5721       if (getTarget().getTriple().isNVPTX())
5722         return EmitNVPTXDevicePrintfCallExpr(E);
5723       if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
5724         return EmitAMDGPUDevicePrintfCallExpr(E);
5725     }
5726 
5727     break;
5728   case Builtin::BI__builtin_canonicalize:
5729   case Builtin::BI__builtin_canonicalizef:
5730   case Builtin::BI__builtin_canonicalizef16:
5731   case Builtin::BI__builtin_canonicalizel:
5732     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
5733 
5734   case Builtin::BI__builtin_thread_pointer: {
5735     if (!getContext().getTargetInfo().isTLSSupported())
5736       CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
5737     // Fall through - it's already mapped to the intrinsic by ClangBuiltin.
5738     break;
5739   }
5740   case Builtin::BI__builtin_os_log_format:
5741     return emitBuiltinOSLogFormat(*E);
5742 
5743   case Builtin::BI__xray_customevent: {
5744     if (!ShouldXRayInstrumentFunction())
5745       return RValue::getIgnored();
5746 
5747     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5748             XRayInstrKind::Custom))
5749       return RValue::getIgnored();
5750 
5751     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5752       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
5753         return RValue::getIgnored();
5754 
5755     Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
5756     auto FTy = F->getFunctionType();
5757     auto Arg0 = E->getArg(0);
5758     auto Arg0Val = EmitScalarExpr(Arg0);
5759     auto Arg0Ty = Arg0->getType();
5760     auto PTy0 = FTy->getParamType(0);
5761     if (PTy0 != Arg0Val->getType()) {
5762       if (Arg0Ty->isArrayType())
5763         Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
5764       else
5765         Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
5766     }
5767     auto Arg1 = EmitScalarExpr(E->getArg(1));
5768     auto PTy1 = FTy->getParamType(1);
5769     if (PTy1 != Arg1->getType())
5770       Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
5771     return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
5772   }
5773 
5774   case Builtin::BI__xray_typedevent: {
5775     // TODO: There should be a way to always emit events even if the current
5776     // function is not instrumented. Losing events in a stream can cripple
5777     // a trace.
5778     if (!ShouldXRayInstrumentFunction())
5779       return RValue::getIgnored();
5780 
5781     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5782             XRayInstrKind::Typed))
5783       return RValue::getIgnored();
5784 
5785     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5786       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
5787         return RValue::getIgnored();
5788 
5789     Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
5790     auto FTy = F->getFunctionType();
5791     auto Arg0 = EmitScalarExpr(E->getArg(0));
5792     auto PTy0 = FTy->getParamType(0);
5793     if (PTy0 != Arg0->getType())
5794       Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
5795     auto Arg1 = E->getArg(1);
5796     auto Arg1Val = EmitScalarExpr(Arg1);
5797     auto Arg1Ty = Arg1->getType();
5798     auto PTy1 = FTy->getParamType(1);
5799     if (PTy1 != Arg1Val->getType()) {
5800       if (Arg1Ty->isArrayType())
5801         Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
5802       else
5803         Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
5804     }
5805     auto Arg2 = EmitScalarExpr(E->getArg(2));
5806     auto PTy2 = FTy->getParamType(2);
5807     if (PTy2 != Arg2->getType())
5808       Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
5809     return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
5810   }
5811 
5812   case Builtin::BI__builtin_ms_va_start:
5813   case Builtin::BI__builtin_ms_va_end:
5814     return RValue::get(
5815         EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
5816                        BuiltinID == Builtin::BI__builtin_ms_va_start));
5817 
5818   case Builtin::BI__builtin_ms_va_copy: {
5819     // Lower this manually. We can't reliably determine whether or not any
5820     // given va_copy() is for a Win64 va_list from the calling convention
5821     // alone, because it's legal to do this from a System V ABI function.
5822     // With opaque pointer types, we won't have enough information in LLVM
5823     // IR to determine this from the argument types, either. Best to do it
5824     // now, while we have enough information.
5825     Address DestAddr = EmitMSVAListRef(E->getArg(0));
5826     Address SrcAddr = EmitMSVAListRef(E->getArg(1));
5827 
5828     DestAddr = DestAddr.withElementType(Int8PtrTy);
5829     SrcAddr = SrcAddr.withElementType(Int8PtrTy);
5830 
5831     Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
5832     return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
5833   }
5834 
5835   case Builtin::BI__builtin_get_device_side_mangled_name: {
5836     auto Name = CGM.getCUDARuntime().getDeviceSideName(
5837         cast<DeclRefExpr>(E->getArg(0)->IgnoreImpCasts())->getDecl());
5838     auto Str = CGM.GetAddrOfConstantCString(Name, "");
5839     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
5840                                llvm::ConstantInt::get(SizeTy, 0)};
5841     auto *Ptr = llvm::ConstantExpr::getGetElementPtr(Str.getElementType(),
5842                                                      Str.getPointer(), Zeros);
5843     return RValue::get(Ptr);
5844   }
5845   }
5846 
5847   // If this is an alias for a lib function (e.g. __builtin_sin), emit
5848   // the call using the normal call path, but using the unmangled
5849   // version of the function name.
5850   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
5851     return emitLibraryCall(*this, FD, E,
5852                            CGM.getBuiltinLibFunction(FD, BuiltinID));
5853 
5854   // If this is a predefined lib function (e.g. malloc), emit the call
5855   // using exactly the normal call path.
5856   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
5857     return emitLibraryCall(*this, FD, E,
5858                       cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
5859 
  // Check that a call to a target-specific builtin has the correct target
  // features.
  // This check is placed down here to avoid doing it for non-target-specific
  // builtins; however, if generic builtins start to require generic target
  // features, then we can move it up to the beginning of the function.
5865   checkTargetFeatures(E, FD);
5866 
  if (unsigned VectorWidth =
          getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
5868     LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
5869 
5870   // See if we have a target specific intrinsic.
5871   StringRef Name = getContext().BuiltinInfo.getName(BuiltinID);
5872   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
5873   StringRef Prefix =
5874       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
5875   if (!Prefix.empty()) {
5876     IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
    // NOTE: we don't need to perform a compatibility flag check here since
    // the intrinsics are declared in Builtins*.def via LANGBUILTIN, which
    // filters the MS builtins via ALL_MS_LANGUAGES, so they have already been
    // filtered out earlier.
5880     if (IntrinsicID == Intrinsic::not_intrinsic)
5881       IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
5882   }
5883 
5884   if (IntrinsicID != Intrinsic::not_intrinsic) {
5885     SmallVector<Value*, 16> Args;
5886 
5887     // Find out if any arguments are required to be integer constant
5888     // expressions.
5889     unsigned ICEArguments = 0;
5890     ASTContext::GetBuiltinTypeError Error;
5891     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5892     assert(Error == ASTContext::GE_None && "Should not codegen an error");
5893 
5894     Function *F = CGM.getIntrinsic(IntrinsicID);
5895     llvm::FunctionType *FTy = F->getFunctionType();
5896 
5897     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
5898       Value *ArgValue = EmitScalarOrConstFoldImmArg(ICEArguments, i, E);
      // If the intrinsic argument type differs from the builtin argument
      // type, we need to do a bitcast.
5901       llvm::Type *PTy = FTy->getParamType(i);
5902       if (PTy != ArgValue->getType()) {
5903         // XXX - vector of pointers?
5904         if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
5905           if (PtrTy->getAddressSpace() !=
5906               ArgValue->getType()->getPointerAddressSpace()) {
5907             ArgValue = Builder.CreateAddrSpaceCast(
5908                 ArgValue, llvm::PointerType::get(getLLVMContext(),
5909                                                  PtrTy->getAddressSpace()));
5910           }
5911         }
5912 
5913         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
5914                "Must be able to losslessly bit cast to param");
        // Cast a vector type (e.g., v256i32) to x86_amx; this only happens
        // in AMX intrinsics.
5917         if (PTy->isX86_AMXTy())
5918           ArgValue = Builder.CreateIntrinsic(Intrinsic::x86_cast_vector_to_tile,
5919                                              {ArgValue->getType()}, {ArgValue});
5920         else
5921           ArgValue = Builder.CreateBitCast(ArgValue, PTy);
5922       }
5923 
5924       Args.push_back(ArgValue);
5925     }
5926 
5927     Value *V = Builder.CreateCall(F, Args);
5928     QualType BuiltinRetType = E->getType();
5929 
5930     llvm::Type *RetTy = VoidTy;
5931     if (!BuiltinRetType->isVoidType())
5932       RetTy = ConvertType(BuiltinRetType);
5933 
5934     if (RetTy != V->getType()) {
5935       // XXX - vector of pointers?
5936       if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
5937         if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
5938           V = Builder.CreateAddrSpaceCast(
5939               V, llvm::PointerType::get(getLLVMContext(),
5940                                         PtrTy->getAddressSpace()));
5941         }
5942       }
5943 
5944       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
5945              "Must be able to losslessly bit cast result type");
      // Cast x86_amx to a vector type (e.g., v256i32); this only happens
      // in AMX intrinsics.
5948       if (V->getType()->isX86_AMXTy())
5949         V = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, {RetTy},
5950                                     {V});
5951       else
5952         V = Builder.CreateBitCast(V, RetTy);
5953     }
5954 
5955     if (RetTy->isVoidTy())
5956       return RValue::get(nullptr);
5957 
5958     return RValue::get(V);
5959   }
5960 
5961   // Some target-specific builtins can have aggregate return values, e.g.
5962   // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force
5963   // ReturnValue to be non-null, so that the target-specific emission code can
5964   // always just emit into it.
5965   TypeEvaluationKind EvalKind = getEvaluationKind(E->getType());
5966   if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) {
5967     Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp");
5968     ReturnValue = ReturnValueSlot(DestPtr, false);
5969   }
5970 
5971   // Now see if we can emit a target-specific builtin.
5972   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) {
5973     switch (EvalKind) {
5974     case TEK_Scalar:
5975       if (V->getType()->isVoidTy())
5976         return RValue::get(nullptr);
5977       return RValue::get(V);
5978     case TEK_Aggregate:
5979       return RValue::getAggregate(ReturnValue.getValue(),
5980                                   ReturnValue.isVolatile());
5981     case TEK_Complex:
5982       llvm_unreachable("No current target builtin returns complex");
5983     }
5984     llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
5985   }
5986 
5987   if (getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)
5988     return EmitHipStdParUnsupportedBuiltin(this, FD);
5989 
5990   ErrorUnsupported(E, "builtin function");
5991 
5992   // Unknown builtin, for now just dump it out and return undef.
5993   return GetUndefRValue(E->getType());
5994 }
5995 
5996 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
5997                                         unsigned BuiltinID, const CallExpr *E,
5998                                         ReturnValueSlot ReturnValue,
5999                                         llvm::Triple::ArchType Arch) {
  // When compiling in HipStdPar mode we have to be conservative in rejecting
  // target-specific features in the FE, and defer the possible error to the
  // AcceleratorCodeSelection pass, which emits an error only if an unsupported
  // target builtin is referenced by an accelerator-executable function.
  // Returning nullptr here leads to the builtin being handled in
  // EmitHipStdParUnsupportedBuiltin.
6006   if (CGF->getLangOpts().HIPStdPar && CGF->getLangOpts().CUDAIsDevice &&
6007       Arch != CGF->getTarget().getTriple().getArch())
6008     return nullptr;
6009 
6010   switch (Arch) {
6011   case llvm::Triple::arm:
6012   case llvm::Triple::armeb:
6013   case llvm::Triple::thumb:
6014   case llvm::Triple::thumbeb:
6015     return CGF->EmitARMBuiltinExpr(BuiltinID, E, ReturnValue, Arch);
6016   case llvm::Triple::aarch64:
6017   case llvm::Triple::aarch64_32:
6018   case llvm::Triple::aarch64_be:
6019     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
6020   case llvm::Triple::bpfeb:
6021   case llvm::Triple::bpfel:
6022     return CGF->EmitBPFBuiltinExpr(BuiltinID, E);
6023   case llvm::Triple::x86:
6024   case llvm::Triple::x86_64:
6025     return CGF->EmitX86BuiltinExpr(BuiltinID, E);
6026   case llvm::Triple::ppc:
6027   case llvm::Triple::ppcle:
6028   case llvm::Triple::ppc64:
6029   case llvm::Triple::ppc64le:
6030     return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
6031   case llvm::Triple::r600:
6032   case llvm::Triple::amdgcn:
6033     return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
6034   case llvm::Triple::systemz:
6035     return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
6036   case llvm::Triple::nvptx:
6037   case llvm::Triple::nvptx64:
6038     return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
6039   case llvm::Triple::wasm32:
6040   case llvm::Triple::wasm64:
6041     return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
6042   case llvm::Triple::hexagon:
6043     return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
6044   case llvm::Triple::riscv32:
6045   case llvm::Triple::riscv64:
6046     return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
6047   default:
6048     return nullptr;
6049   }
6050 }
6051 
6052 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
6053                                               const CallExpr *E,
6054                                               ReturnValueSlot ReturnValue) {
6055   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
6056     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
6057     return EmitTargetArchBuiltinExpr(
6058         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
6059         ReturnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
6060   }
6061 
6062   return EmitTargetArchBuiltinExpr(this, BuiltinID, E, ReturnValue,
6063                                    getTarget().getTriple().getArch());
6064 }
6065 
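// Map a NeonTypeFlags descriptor onto the corresponding LLVM fixed vector
// type.  For example, Int32 yields <2 x i32> for a 64-bit vector and
// <4 x i32> when the quad (128-bit) flag is set.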
6066 static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
6067                                           NeonTypeFlags TypeFlags,
6068                                           bool HasLegalHalfType = true,
6069                                           bool V1Ty = false,
6070                                           bool AllowBFloatArgsAndRet = true) {
6071   int IsQuad = TypeFlags.isQuad();
6072   switch (TypeFlags.getEltType()) {
6073   case NeonTypeFlags::Int8:
6074   case NeonTypeFlags::Poly8:
6075     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
6076   case NeonTypeFlags::Int16:
6077   case NeonTypeFlags::Poly16:
6078     return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6079   case NeonTypeFlags::BFloat16:
6080     if (AllowBFloatArgsAndRet)
6081       return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
6082     else
6083       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6084   case NeonTypeFlags::Float16:
6085     if (HasLegalHalfType)
6086       return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
6087     else
6088       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6089   case NeonTypeFlags::Int32:
6090     return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
6091   case NeonTypeFlags::Int64:
6092   case NeonTypeFlags::Poly64:
6093     return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
6094   case NeonTypeFlags::Poly128:
    // FIXME: i128 and f128 don't get full support in Clang and LLVM;
    // a lot of the i128 and f128 API is missing, so we use v16i8 to
    // represent poly128 and rely on pattern matching.
6098     return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
6099   case NeonTypeFlags::Float32:
6100     return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
6101   case NeonTypeFlags::Float64:
6102     return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
6103   }
6104   llvm_unreachable("Unknown vector element type!");
6105 }
6106 
6107 static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
6108                                           NeonTypeFlags IntTypeFlags) {
6109   int IsQuad = IntTypeFlags.isQuad();
6110   switch (IntTypeFlags.getEltType()) {
6111   case NeonTypeFlags::Int16:
6112     return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
6113   case NeonTypeFlags::Int32:
6114     return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
6115   case NeonTypeFlags::Int64:
6116     return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
6117   default:
6118     llvm_unreachable("Type can't be converted to floating-point!");
6119   }
6120 }
6121 
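/// Broadcast the lane of V selected by the constant C across Count result
/// lanes by emitting a shufflevector with a splat mask.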
6122 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
6123                                       const ElementCount &Count) {
6124   Value *SV = llvm::ConstantVector::getSplat(Count, C);
6125   return Builder.CreateShuffleVector(V, V, SV, "lane");
6126 }
6127 
6128 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
6129   ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
6130   return EmitNeonSplat(V, C, EC);
6131 }
6132 
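/// Emit a call to the NEON intrinsic F. Each operand is bitcast to the
/// parameter type F expects, except that the operand at index 'shift' (when
/// nonzero) is rebuilt as a constant shift-amount vector, negated for right
/// shifts. Constrained FP intrinsics are emitted with CreateConstrainedFPCall
/// and their metadata parameters are skipped rather than bitcast.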
6133 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
6134                                      const char *name,
6135                                      unsigned shift, bool rightshift) {
6136   unsigned j = 0;
6137   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
6138        ai != ae; ++ai, ++j) {
6139     if (F->isConstrainedFPIntrinsic())
6140       if (ai->getType()->isMetadataTy())
6141         continue;
6142     if (shift > 0 && shift == j)
6143       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
6144     else
6145       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
6146   }
6147 
6148   if (F->isConstrainedFPIntrinsic())
6149     return Builder.CreateConstrainedFPCall(F, Ops, name);
6150   else
6151     return Builder.CreateCall(F, Ops, name);
6152 }
6153 
6154 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
6155                                             bool neg) {
6156   int SV = cast<ConstantInt>(V)->getSExtValue();
6157   return ConstantInt::get(Ty, neg ? -SV : SV);
6158 }
6159 
6160 // Right-shift a vector by a constant.
6161 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
6162                                           llvm::Type *Ty, bool usgn,
6163                                           const char *name) {
6164   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
6165 
6166   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
6167   int EltSize = VTy->getScalarSizeInBits();
6168 
6169   Vec = Builder.CreateBitCast(Vec, Ty);
6170 
6171   // lshr/ashr are undefined when the shift amount is equal to the vector
6172   // element size.
6173   if (ShiftAmt == EltSize) {
6174     if (usgn) {
6175       // Right-shifting an unsigned value by its size yields 0.
6176       return llvm::ConstantAggregateZero::get(VTy);
6177     } else {
6178       // Right-shifting a signed value by its size is equivalent
6179       // to a shift of size-1.
6180       --ShiftAmt;
6181       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
6182     }
6183   }
6184 
6185   Shift = EmitNeonShiftVector(Shift, Ty, false);
6186   if (usgn)
6187     return Builder.CreateLShr(Vec, Shift, name);
6188   else
6189     return Builder.CreateAShr(Vec, Shift, name);
6190 }
6191 
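// Type-modifier flags used by the ARMVectorIntrinsicInfo tables below. They
// describe how an entry's LLVM intrinsic signature is derived from the
// builtin call: whether the return type and/or leading argument types take
// part in intrinsic overloading, whether scalar operands are widened to 64-
// or 128-bit vectors, and whether the entry carries an unsigned/signed
// intrinsic pair chosen by the element type's signedness (UnsignedAlts).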
6192 enum {
6193   AddRetType = (1 << 0),
6194   Add1ArgType = (1 << 1),
6195   Add2ArgTypes = (1 << 2),
6196 
6197   VectorizeRetType = (1 << 3),
6198   VectorizeArgTypes = (1 << 4),
6199 
6200   InventFloatType = (1 << 5),
6201   UnsignedAlts = (1 << 6),
6202 
6203   Use64BitVectors = (1 << 7),
6204   Use128BitVectors = (1 << 8),
6205 
6206   Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
6207   VectorRet = AddRetType | VectorizeRetType,
6208   VectorRetGetArgs01 =
6209       AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
6210   FpCmpzModifiers =
6211       AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
6212 };
6213 
6214 namespace {
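/// One entry in the builtin-to-LLVM-intrinsic tables below. The tables are
/// expected to be sorted by BuiltinID; the operator< overloads exist so they
/// can be searched with std::lower_bound.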
6215 struct ARMVectorIntrinsicInfo {
6216   const char *NameHint;
6217   unsigned BuiltinID;
6218   unsigned LLVMIntrinsic;
6219   unsigned AltLLVMIntrinsic;
6220   uint64_t TypeModifier;
6221 
6222   bool operator<(unsigned RHSBuiltinID) const {
6223     return BuiltinID < RHSBuiltinID;
6224   }
6225   bool operator<(const ARMVectorIntrinsicInfo &TE) const {
6226     return BuiltinID < TE.BuiltinID;
6227   }
6228 };
6229 } // end anonymous namespace
6230 
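// NEONMAP0/1/2 build table entries for a NEON builtin that maps onto zero,
// one, or two LLVM intrinsics. The two-intrinsic form supplies an alternate
// intrinsic, typically the signed counterpart selected via UnsignedAlts.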
6231 #define NEONMAP0(NameBase) \
6232   { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }
6233 
6234 #define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
6235   { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
6236       Intrinsic::LLVMIntrinsic, 0, TypeModifier }
6237 
6238 #define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
6239   { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
6240       Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
6241       TypeModifier }
6242 
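// 32-bit ARM mapping from NEON builtins to arm_neon_* intrinsics. Entries
// created with NEONMAP0 have no direct intrinsic and are lowered by
// hand-written codegen instead.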
6243 static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap[] = {
6244   NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
6245   NEONMAP0(splat_lane_v),
6246   NEONMAP0(splat_laneq_v),
6247   NEONMAP0(splatq_lane_v),
6248   NEONMAP0(splatq_laneq_v),
6249   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
6250   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
6251   NEONMAP1(vabs_v, arm_neon_vabs, 0),
6252   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
6253   NEONMAP0(vadd_v),
6254   NEONMAP0(vaddhn_v),
6255   NEONMAP0(vaddq_v),
6256   NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
6257   NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
6258   NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
6259   NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
6260   NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
6261   NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
6262   NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
6263   NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
6264   NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
6265   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
6266   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
6267   NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
6268   NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
6269   NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
6270   NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
6271   NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
6272   NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
6273   NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
6274   NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
6275   NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
6276   NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
6277   NEONMAP1(vcage_v, arm_neon_vacge, 0),
6278   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
6279   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
6280   NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
6281   NEONMAP1(vcale_v, arm_neon_vacge, 0),
6282   NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
6283   NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
6284   NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
6285   NEONMAP0(vceqz_v),
6286   NEONMAP0(vceqzq_v),
6287   NEONMAP0(vcgez_v),
6288   NEONMAP0(vcgezq_v),
6289   NEONMAP0(vcgtz_v),
6290   NEONMAP0(vcgtzq_v),
6291   NEONMAP0(vclez_v),
6292   NEONMAP0(vclezq_v),
6293   NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
6294   NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
6295   NEONMAP0(vcltz_v),
6296   NEONMAP0(vcltzq_v),
6297   NEONMAP1(vclz_v, ctlz, Add1ArgType),
6298   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6299   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6300   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6301   NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
6302   NEONMAP0(vcvt_f16_s16),
6303   NEONMAP0(vcvt_f16_u16),
6304   NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
6305   NEONMAP0(vcvt_f32_v),
6306   NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
6307   NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
6308   NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
6309   NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
6310   NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
6311   NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
6312   NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
6313   NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
6314   NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
6315   NEONMAP0(vcvt_s16_f16),
6316   NEONMAP0(vcvt_s32_v),
6317   NEONMAP0(vcvt_s64_v),
6318   NEONMAP0(vcvt_u16_f16),
6319   NEONMAP0(vcvt_u32_v),
6320   NEONMAP0(vcvt_u64_v),
6321   NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
6322   NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
6323   NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
6324   NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
6325   NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
6326   NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
6327   NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
6328   NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
6329   NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
6330   NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
6331   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
6332   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
6333   NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
6334   NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
6335   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
6336   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
6337   NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
6338   NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
6339   NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
6340   NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
6341   NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
6342   NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
6343   NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
6344   NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
6345   NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
6346   NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
6347   NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
6348   NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
6349   NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
6350   NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
6351   NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
6352   NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
6353   NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
6354   NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
6355   NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
6356   NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
6357   NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
6358   NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
6359   NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
6360   NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
6361   NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
6362   NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
6363   NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
6364   NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
6365   NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
6366   NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
6367   NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
6368   NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
6369   NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
6370   NEONMAP0(vcvtq_f16_s16),
6371   NEONMAP0(vcvtq_f16_u16),
6372   NEONMAP0(vcvtq_f32_v),
6373   NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
6374   NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
6375   NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
6376   NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
6377   NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
6378   NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
6379   NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
6380   NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
6381   NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
6382   NEONMAP0(vcvtq_s16_f16),
6383   NEONMAP0(vcvtq_s32_v),
6384   NEONMAP0(vcvtq_s64_v),
6385   NEONMAP0(vcvtq_u16_f16),
6386   NEONMAP0(vcvtq_u32_v),
6387   NEONMAP0(vcvtq_u64_v),
6388   NEONMAP1(vdot_s32, arm_neon_sdot, 0),
6389   NEONMAP1(vdot_u32, arm_neon_udot, 0),
6390   NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
6391   NEONMAP1(vdotq_u32, arm_neon_udot, 0),
6392   NEONMAP0(vext_v),
6393   NEONMAP0(vextq_v),
6394   NEONMAP0(vfma_v),
6395   NEONMAP0(vfmaq_v),
6396   NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
6397   NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
6398   NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
6399   NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
6400   NEONMAP0(vld1_dup_v),
6401   NEONMAP1(vld1_v, arm_neon_vld1, 0),
6402   NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
6403   NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
6404   NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
6405   NEONMAP0(vld1q_dup_v),
6406   NEONMAP1(vld1q_v, arm_neon_vld1, 0),
6407   NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
6408   NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
6409   NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
6410   NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
6411   NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
6412   NEONMAP1(vld2_v, arm_neon_vld2, 0),
6413   NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
6414   NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
6415   NEONMAP1(vld2q_v, arm_neon_vld2, 0),
6416   NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
6417   NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
6418   NEONMAP1(vld3_v, arm_neon_vld3, 0),
6419   NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
6420   NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
6421   NEONMAP1(vld3q_v, arm_neon_vld3, 0),
6422   NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
6423   NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
6424   NEONMAP1(vld4_v, arm_neon_vld4, 0),
6425   NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
6426   NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
6427   NEONMAP1(vld4q_v, arm_neon_vld4, 0),
6428   NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
6429   NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
6430   NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
6431   NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
6432   NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
6433   NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
6434   NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
6435   NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
6436   NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
6437   NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
6438   NEONMAP0(vmovl_v),
6439   NEONMAP0(vmovn_v),
6440   NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
6441   NEONMAP0(vmull_v),
6442   NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
6443   NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
6444   NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
6445   NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
6446   NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
6447   NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
6448   NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
6449   NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
6450   NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
6451   NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
6452   NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
6453   NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6454   NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6455   NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
6456   NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
6457   NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
6458   NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
6459   NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
6460   NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
6461   NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
6462   NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
6463   NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
6464   NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
6465   NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
6466   NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
6467   NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
6468   NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
6469   NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
6470   NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
6471   NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
6472   NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
6473   NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
6474   NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6475   NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6476   NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6477   NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6478   NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6479   NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6480   NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
6481   NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
6482   NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6483   NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6484   NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
6485   NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6486   NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6487   NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
6488   NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
6489   NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6490   NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6491   NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
6492   NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
6493   NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
6494   NEONMAP0(vrndi_v),
6495   NEONMAP0(vrndiq_v),
6496   NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
6497   NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
6498   NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
6499   NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
6500   NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
6501   NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
6502   NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
6503   NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
6504   NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
6505   NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6506   NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6507   NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6508   NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6509   NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6510   NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6511   NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
6512   NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
6513   NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
6514   NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
6515   NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
6516   NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
6517   NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
6518   NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
6519   NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
6520   NEONMAP0(vshl_n_v),
6521   NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6522   NEONMAP0(vshll_n_v),
6523   NEONMAP0(vshlq_n_v),
6524   NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6525   NEONMAP0(vshr_n_v),
6526   NEONMAP0(vshrn_n_v),
6527   NEONMAP0(vshrq_n_v),
6528   NEONMAP1(vst1_v, arm_neon_vst1, 0),
6529   NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
6530   NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
6531   NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
6532   NEONMAP1(vst1q_v, arm_neon_vst1, 0),
6533   NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
6534   NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
6535   NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
6536   NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
6537   NEONMAP1(vst2_v, arm_neon_vst2, 0),
6538   NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
6539   NEONMAP1(vst2q_v, arm_neon_vst2, 0),
6540   NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
6541   NEONMAP1(vst3_v, arm_neon_vst3, 0),
6542   NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
6543   NEONMAP1(vst3q_v, arm_neon_vst3, 0),
6544   NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
6545   NEONMAP1(vst4_v, arm_neon_vst4, 0),
6546   NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
6547   NEONMAP1(vst4q_v, arm_neon_vst4, 0),
6548   NEONMAP0(vsubhn_v),
6549   NEONMAP0(vtrn_v),
6550   NEONMAP0(vtrnq_v),
6551   NEONMAP0(vtst_v),
6552   NEONMAP0(vtstq_v),
6553   NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
6554   NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
6555   NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
6556   NEONMAP0(vuzp_v),
6557   NEONMAP0(vuzpq_v),
6558   NEONMAP0(vzip_v),
6559   NEONMAP0(vzipq_v)
6560 };
6561 
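// AArch64 counterpart of ARMSIMDIntrinsicMap: NEON builtins mapped onto
// aarch64_neon_* and aarch64_crypto_* intrinsics rather than the 32-bit
// arm_neon_* ones.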
6562 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
6563   NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
6564   NEONMAP0(splat_lane_v),
6565   NEONMAP0(splat_laneq_v),
6566   NEONMAP0(splatq_lane_v),
6567   NEONMAP0(splatq_laneq_v),
6568   NEONMAP1(vabs_v, aarch64_neon_abs, 0),
6569   NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
6570   NEONMAP0(vadd_v),
6571   NEONMAP0(vaddhn_v),
6572   NEONMAP0(vaddq_p128),
6573   NEONMAP0(vaddq_v),
6574   NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
6575   NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
6576   NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
6577   NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
6578   NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6579   NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6580   NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6581   NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6582   NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6583   NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6584   NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6585   NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6586   NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
6587   NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
6588   NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
6589   NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
6590   NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
6591   NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6592   NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6593   NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6594   NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6595   NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6596   NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6597   NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
6598   NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6599   NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6600   NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
6601   NEONMAP1(vcage_v, aarch64_neon_facge, 0),
6602   NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
6603   NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
6604   NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
6605   NEONMAP1(vcale_v, aarch64_neon_facge, 0),
6606   NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
6607   NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
6608   NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
6609   NEONMAP0(vceqz_v),
6610   NEONMAP0(vceqzq_v),
6611   NEONMAP0(vcgez_v),
6612   NEONMAP0(vcgezq_v),
6613   NEONMAP0(vcgtz_v),
6614   NEONMAP0(vcgtzq_v),
6615   NEONMAP0(vclez_v),
6616   NEONMAP0(vclezq_v),
6617   NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
6618   NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
6619   NEONMAP0(vcltz_v),
6620   NEONMAP0(vcltzq_v),
6621   NEONMAP1(vclz_v, ctlz, Add1ArgType),
6622   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6623   NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6624   NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6625   NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6626   NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6627   NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6628   NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6629   NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6630   NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6631   NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6632   NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6633   NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
6634   NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6635   NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6636   NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
6637   NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6638   NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6639   NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
6640   NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6641   NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6642   NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
6643   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6644   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6645   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
6646   NEONMAP0(vcvt_f16_s16),
6647   NEONMAP0(vcvt_f16_u16),
6648   NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
6649   NEONMAP0(vcvt_f32_v),
6650   NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6651   NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6652   NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6653   NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6654   NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6655   NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6656   NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6657   NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6658   NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6659   NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6660   NEONMAP0(vcvtq_f16_s16),
6661   NEONMAP0(vcvtq_f16_u16),
6662   NEONMAP0(vcvtq_f32_v),
6663   NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
6664   NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6665   NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6666   NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6667   NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6668   NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6669   NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6670   NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6671   NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6672   NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6673   NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6674   NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
6675   NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
6676   NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
6677   NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
6678   NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
6679   NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6680   NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6681   NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6682   NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6683   NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6684   NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6685   NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6686   NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6687   NEONMAP0(vext_v),
6688   NEONMAP0(vextq_v),
6689   NEONMAP0(vfma_v),
6690   NEONMAP0(vfmaq_v),
6691   NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
6692   NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
6693   NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
6694   NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
6695   NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
6696   NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
6697   NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
6698   NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
6699   NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6700   NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6701   NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6702   NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6703   NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
6704   NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
6705   NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
6706   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
6707   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
6708   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
6709   NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
6710   NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
6711   NEONMAP0(vmovl_v),
6712   NEONMAP0(vmovn_v),
6713   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
6714   NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
6715   NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
6716   NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6717   NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6718   NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
6719   NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
6720   NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
6721   NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6722   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6723   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
6724   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
6725   NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
6726   NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6727   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
6728   NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
6729   NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6730   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
6731   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
6732   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
6733   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
6734   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
6735   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
6736   NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6737   NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6738   NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6739   NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6740   NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6741   NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6742   NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6743   NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6744   NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6745   NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6746   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
6747   NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6748   NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6749   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
6750   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6751   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6752   NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6753   NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6754   NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6755   NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6756   NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
6757   NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
6758   NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6759   NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6760   NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
6761   NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
6762   NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6763   NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6764   NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
6765   NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
6766   NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6767   NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6768   NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
6769   NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
6770   NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
6771   NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
6772   NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
6773   NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
6774   NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
6775   NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
6776   NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
6777   NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
6778   NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
6779   NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
6780   NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
6781   NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
6782   NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
6783   NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
6784   NEONMAP0(vrndi_v),
6785   NEONMAP0(vrndiq_v),
6786   NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6787   NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6788   NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6789   NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6790   NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6791   NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6792   NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
6793   NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
6794   NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
6795   NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
6796   NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
6797   NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
6798   NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
6799   NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
6800   NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
6801   NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
6802   NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
6803   NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
6804   NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
6805   NEONMAP0(vshl_n_v),
6806   NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6807   NEONMAP0(vshll_n_v),
6808   NEONMAP0(vshlq_n_v),
6809   NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6810   NEONMAP0(vshr_n_v),
6811   NEONMAP0(vshrn_n_v),
6812   NEONMAP0(vshrq_n_v),
6813   NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
6814   NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
6815   NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
6816   NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
6817   NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
6818   NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
6819   NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
6820   NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
6821   NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
6822   NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
6823   NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
6824   NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
6825   NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
6826   NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
6827   NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
6828   NEONMAP0(vsubhn_v),
6829   NEONMAP0(vtst_v),
6830   NEONMAP0(vtstq_v),
6831   NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
6832   NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
6833   NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
6834   NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
6835 };
6836 
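// Scalar (SISD) AArch64 intrinsics. These operate on single elements, so
// many entries use the Vectorize*/Use*BitVectors modifiers to wrap scalar
// operands in the vector types the underlying LLVM intrinsics expect.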
6837 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
6838   NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
6839   NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
6840   NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
6841   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6842   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6843   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6844   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6845   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6846   NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6847   NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6848   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6849   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
6850   NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6851   NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
6852   NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6853   NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6854   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6855   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6856   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6857   NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6858   NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6859   NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6860   NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6861   NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6862   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6863   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6864   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6865   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6866   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6867   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6868   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6869   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6870   NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6871   NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6872   NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
6873   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6874   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6875   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6876   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6877   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6878   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6879   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6880   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6881   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6882   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6883   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6884   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6885   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6886   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6887   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6888   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6889   NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6890   NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6891   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
6892   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6893   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6894   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6895   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6896   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6897   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6898   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6899   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6900   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6901   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6902   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6903   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6904   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6905   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6906   NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6907   NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6908   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6909   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6910   NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6911   NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6912   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
6913   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
6914   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
6915   NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6916   NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6917   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6918   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6919   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6920   NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6921   NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6922   NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6923   NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6924   NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6925   NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6926   NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
6927   NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6928   NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
6929   NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6930   NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6931   NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
6932   NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
6933   NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6934   NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6935   NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
6936   NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
6937   NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
6938   NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
6939   NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
6940   NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
6941   NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
6942   NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
6943   NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6944   NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6945   NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6946   NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6947   NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
6948   NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6949   NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6950   NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6951   NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
6952   NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6953   NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
6954   NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
6955   NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6956   NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
6957   NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6958   NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
6959   NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
6960   NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6961   NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6962   NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
6963   NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
6964   NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6965   NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6966   NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
6967   NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
6968   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
6969   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
6970   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6971   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6972   NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6973   NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6974   NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
6975   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6976   NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6977   NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6978   NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6979   NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6980   NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6981   NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
6982   NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
6983   NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6984   NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6985   NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6986   NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6987   NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
6988   NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
6989   NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
6990   NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
6991   NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6992   NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6993   NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
6994   NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
6995   NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
6996   NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6997   NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
6998   NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6999   NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
7000   NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
7001   NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
7002   NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
7003   NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
7004   NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
7005   NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
7006   NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
7007   NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
7008   NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
7009   NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
7010   NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
7011   NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
7012   NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
7013   NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
7014   NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
7015   NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
7016   NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
7017   NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
7018   NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
7019   NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
7020   NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
7021   NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
7022   NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
7023   NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
7024   NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
7025   NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
7026   NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
7027   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
7028   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
7029   NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
7030   NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
7031   NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
7032   NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
7033   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
7034   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
7035   NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
7036   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
7037   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
7038   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
7039   // FP16 scalar intrinsics go here.
7040   NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
7041   NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
7042   NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
7043   NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
7044   NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
7045   NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
7046   NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
7047   NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
7048   NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
7049   NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
7050   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
7051   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
7052   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
7053   NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
7054   NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
7055   NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7056   NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7057   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
7058   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
7059   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
7060   NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
7061   NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
7062   NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
7063   NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
7064   NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
7065   NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
7066   NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
7067   NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
7068   NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
7069   NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
7070   NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
7071   NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
7072   NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
7073   NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
7074 };
7075 
7076 // Some intrinsics are equivalent for codegen.
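// Each pair maps a builtin (mostly bf16 and f16 variants) onto the canonical
// builtin that the existing codegen already handles, so no separate lowering
// is needed for the equivalent form.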
7077 static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
7078   { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
7079   { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
7080   { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
7081   { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
7082   { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
7083   { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
7084   { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
7085   { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
7086   { NEON::BI__builtin_neon_vbsl_f16, NEON::BI__builtin_neon_vbsl_v, },
7087   { NEON::BI__builtin_neon_vbslq_f16, NEON::BI__builtin_neon_vbslq_v, },
7088   { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
7089   { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
7090   { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
7091   { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
7092   { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
7093   { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
7094   { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
7095   { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
7096   { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
7097   { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
7098   { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
7099   { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
7100   { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
7101   { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
7102   { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
7103   { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
7104   { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
7105   { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
7106   { NEON::BI__builtin_neon_vext_f16, NEON::BI__builtin_neon_vext_v, },
7107   { NEON::BI__builtin_neon_vextq_f16, NEON::BI__builtin_neon_vextq_v, },
7108   { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
7109   { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
7110   { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
7111   { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
7112   { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
7113   { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
7114   { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
7115   { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
7116   { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
7117   { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
7118   { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
7119   { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
7120   { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
7121   { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
7122   { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
7123   { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
7124   { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
7125   { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
7126   { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
7127   { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
7128   { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
7129   { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
7130   { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
7131   { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
7132   { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
7133   { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
7134   { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
7135   { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
7136   { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
7137   { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
7138   { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
7139   { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
7140   { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
7141   { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
7142   { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
7143   { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
7144   { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
7145   { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
7146   { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
7147   { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
7148   { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
7149   { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
7150   { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
7151   { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
7152   { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
7153   { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
7154   { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
7155   { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
7156   { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
7157   { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
7158   { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
7159   { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
7160   { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
7161   { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
7162   { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
7163   { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
7164   { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
7165   { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
7166   { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
7167   { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
7168   { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
7169   { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
7170   { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
7171   { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
7172   { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
7173   { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
7174   { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
7175   { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
7176   { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
7177   { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
7178   { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
7179   { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
7180   { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
7181   { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
7182   { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
7183   { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
7184   { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
7185   { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
7186   { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
7187   { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
7188   { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
7189   { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
7190   { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
7191   { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
7192   { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
7193   { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
7194   { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
7195   { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
7196   { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
7197   { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
7198   { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
7199   { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
7200   { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
7201   { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
7202   { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
7203   { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
7204   { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
7205   { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
7206   { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
7207   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
7208   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
7209   { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
7210   { NEON::BI__builtin_neon_vtrn_f16, NEON::BI__builtin_neon_vtrn_v, },
7211   { NEON::BI__builtin_neon_vtrnq_f16, NEON::BI__builtin_neon_vtrnq_v, },
7212   { NEON::BI__builtin_neon_vuzp_f16, NEON::BI__builtin_neon_vuzp_v, },
7213   { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, },
7214   { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, },
7215   { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, },
7216   // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
7217   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
7218   // arbitrary one to be handled as the canonical variant.
7219   { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7220   { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7221   { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7222   { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7223   { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7224   { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7225   { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7226   { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7227   { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7228   { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7229   { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7230   { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7231 };
7232 
7233 #undef NEONMAP0
7234 #undef NEONMAP1
7235 #undef NEONMAP2
7236 
7237 #define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
7238   {                                                                            \
7239     #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
7240         TypeModifier                                                           \
7241   }
7242 
7243 #define SVEMAP2(NameBase, TypeModifier)                                        \
7244   { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
7245 static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
7246 #define GET_SVE_LLVM_INTRINSIC_MAP
7247 #include "clang/Basic/arm_sve_builtin_cg.inc"
7248 #include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
7249 #undef GET_SVE_LLVM_INTRINSIC_MAP
7250 };
7251 
7252 #undef SVEMAP1
7253 #undef SVEMAP2
7254 
7255 #define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
7256   {                                                                            \
7257     #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
7258         TypeModifier                                                           \
7259   }
7260 
7261 #define SMEMAP2(NameBase, TypeModifier)                                        \
7262   { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
7263 static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
7264 #define GET_SME_LLVM_INTRINSIC_MAP
7265 #include "clang/Basic/arm_sme_builtin_cg.inc"
7266 #undef GET_SME_LLVM_INTRINSIC_MAP
7267 };
7268 
7269 #undef SMEMAP1
7270 #undef SMEMAP2
7271 
7272 static bool NEONSIMDIntrinsicsProvenSorted = false;
7273 
7274 static bool AArch64SIMDIntrinsicsProvenSorted = false;
7275 static bool AArch64SISDIntrinsicsProvenSorted = false;
7276 static bool AArch64SVEIntrinsicsProvenSorted = false;
7277 static bool AArch64SMEIntrinsicsProvenSorted = false;
7278 
7279 static const ARMVectorIntrinsicInfo *
7280 findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
7281                             unsigned BuiltinID, bool &MapProvenSorted) {
7282 
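  // The tables are sorted by BuiltinID so that lower_bound below can binary
  // search them; in asserts builds, verify that once per table.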
7283 #ifndef NDEBUG
7284   if (!MapProvenSorted) {
7285     assert(llvm::is_sorted(IntrinsicMap));
7286     MapProvenSorted = true;
7287   }
7288 #endif
7289 
7290   const ARMVectorIntrinsicInfo *Builtin =
7291       llvm::lower_bound(IntrinsicMap, BuiltinID);
7292 
7293   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
7294     return Builtin;
7295 
7296   return nullptr;
7297 }
7298 
7299 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
7300                                                    unsigned Modifier,
7301                                                    llvm::Type *ArgType,
7302                                                    const CallExpr *E) {
7303   int VectorSize = 0;
7304   if (Modifier & Use64BitVectors)
7305     VectorSize = 64;
7306   else if (Modifier & Use128BitVectors)
7307     VectorSize = 128;
7308 
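  // Collect the overloaded types for CGM.getIntrinsic: depending on the
  // modifier flags, the return type and/or the argument type participate,
  // optionally promoted to 64- or 128-bit vectors.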
7309   // Return type.
7310   SmallVector<llvm::Type *, 3> Tys;
7311   if (Modifier & AddRetType) {
7312     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
7313     if (Modifier & VectorizeRetType)
7314       Ty = llvm::FixedVectorType::get(
7315           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
7316 
7317     Tys.push_back(Ty);
7318   }
7319 
7320   // Arguments.
7321   if (Modifier & VectorizeArgTypes) {
7322     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
7323     ArgType = llvm::FixedVectorType::get(ArgType, Elts);
7324   }
7325 
7326   if (Modifier & (Add1ArgType | Add2ArgTypes))
7327     Tys.push_back(ArgType);
7328 
7329   if (Modifier & Add2ArgTypes)
7330     Tys.push_back(ArgType);
7331 
7332   if (Modifier & InventFloatType)
7333     Tys.push_back(FloatTy);
7334 
7335   return CGM.getIntrinsic(IntrinsicID, Tys);
7336 }
7337 
7338 static Value *EmitCommonNeonSISDBuiltinExpr(
7339     CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
7340     SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
7341   unsigned BuiltinID = SISDInfo.BuiltinID;
7342   unsigned int Int = SISDInfo.LLVMIntrinsic;
7343   unsigned Modifier = SISDInfo.TypeModifier;
7344   const char *s = SISDInfo.NameHint;
7345 
7346   switch (BuiltinID) {
7347   case NEON::BI__builtin_neon_vcled_s64:
7348   case NEON::BI__builtin_neon_vcled_u64:
7349   case NEON::BI__builtin_neon_vcles_f32:
7350   case NEON::BI__builtin_neon_vcled_f64:
7351   case NEON::BI__builtin_neon_vcltd_s64:
7352   case NEON::BI__builtin_neon_vcltd_u64:
7353   case NEON::BI__builtin_neon_vclts_f32:
7354   case NEON::BI__builtin_neon_vcltd_f64:
7355   case NEON::BI__builtin_neon_vcales_f32:
7356   case NEON::BI__builtin_neon_vcaled_f64:
7357   case NEON::BI__builtin_neon_vcalts_f32:
7358   case NEON::BI__builtin_neon_vcaltd_f64:
7359     // Only one direction of comparison actually exists: cmle is really a cmge
7360     // with swapped operands. The table gives us the right intrinsic, but we
7361     // still need to do the swap.
7362     std::swap(Ops[0], Ops[1]);
7363     break;
7364   }
7365 
7366   assert(Int && "Generic code assumes a valid intrinsic");
7367 
7368   // Determine the type(s) of this overloaded AArch64 intrinsic.
7369   const Expr *Arg = E->getArg(0);
7370   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
7371   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
7372 
7373   int j = 0;
7374   ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
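  // Where the intrinsic expects vector operands but the builtin provides
  // scalars, promote each such operand by inserting it into lane 0 of a
  // one-element vector of the expected element type.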
7375   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
7376        ai != ae; ++ai, ++j) {
7377     llvm::Type *ArgTy = ai->getType();
7378     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
7379              ArgTy->getPrimitiveSizeInBits())
7380       continue;
7381 
7382     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
7383     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
7384     // it before inserting.
7385     Ops[j] = CGF.Builder.CreateTruncOrBitCast(
7386         Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
7387     Ops[j] =
7388         CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
7389   }
7390 
7391   Value *Result = CGF.EmitNeonCall(F, Ops, s);
7392   llvm::Type *ResultType = CGF.ConvertType(E->getType());
7393   if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
7394       Result->getType()->getPrimitiveSizeInBits().getFixedValue())
7395     return CGF.Builder.CreateExtractElement(Result, C0);
7396 
7397   return CGF.Builder.CreateBitCast(Result, ResultType, s);
7398 }
7399 
7400 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
7401     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
7402     const char *NameHint, unsigned Modifier, const CallExpr *E,
7403     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
7404     llvm::Triple::ArchType Arch) {
7405   // Get the last argument, which specifies the vector type.
7406   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
7407   std::optional<llvm::APSInt> NeonTypeConst =
7408       Arg->getIntegerConstantExpr(getContext());
7409   if (!NeonTypeConst)
7410     return nullptr;
7411 
7412   // Determine the type of this overloaded NEON intrinsic.
7413   NeonTypeFlags Type(NeonTypeConst->getZExtValue());
7414   bool Usgn = Type.isUnsigned();
7415   bool Quad = Type.isQuad();
7416   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
7417   const bool AllowBFloatArgsAndRet =
7418       getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
7419 
7420   llvm::FixedVectorType *VTy =
7421       GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet);
7422   llvm::Type *Ty = VTy;
7423   if (!Ty)
7424     return nullptr;
7425 
7426   auto getAlignmentValue32 = [&](Address addr) -> Value* {
7427     return Builder.getInt32(addr.getAlignment().getQuantity());
7428   };
7429 
7430   unsigned Int = LLVMIntrinsic;
7431   if ((Modifier & UnsignedAlts) && !Usgn)
7432     Int = AltLLVMIntrinsic;
7433 
7434   switch (BuiltinID) {
7435   default: break;
7436   case NEON::BI__builtin_neon_splat_lane_v:
7437   case NEON::BI__builtin_neon_splat_laneq_v:
7438   case NEON::BI__builtin_neon_splatq_lane_v:
7439   case NEON::BI__builtin_neon_splatq_laneq_v: {
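    // VTy describes the input vector: splatq_lane produces a result with twice
    // as many lanes as its 64-bit input, and splat_laneq produces one with half
    // as many lanes as its 128-bit input, so adjust the element count.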
7440     auto NumElements = VTy->getElementCount();
7441     if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
7442       NumElements = NumElements * 2;
7443     if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
7444       NumElements = NumElements.divideCoefficientBy(2);
7445 
7446     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7447     return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
7448   }
7449   case NEON::BI__builtin_neon_vpadd_v:
7450   case NEON::BI__builtin_neon_vpaddq_v:
7451     // We don't allow fp/int overloading of intrinsics.
7452     if (VTy->getElementType()->isFloatingPointTy() &&
7453         Int == Intrinsic::aarch64_neon_addp)
7454       Int = Intrinsic::aarch64_neon_faddp;
7455     break;
7456   case NEON::BI__builtin_neon_vabs_v:
7457   case NEON::BI__builtin_neon_vabsq_v:
7458     if (VTy->getElementType()->isFloatingPointTy())
7459       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
7460     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
7461   case NEON::BI__builtin_neon_vadd_v:
7462   case NEON::BI__builtin_neon_vaddq_v: {
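    // In practice only the polynomial overloads (vadd_p8 and friends) reach
    // this path; polynomial addition is XOR of the raw bits, so operate on an
    // equivalent integer vector and bitcast the result back.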
7463     llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
7464     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7465     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
7466     Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
7467     return Builder.CreateBitCast(Ops[0], Ty);
7468   }
7469   case NEON::BI__builtin_neon_vaddhn_v: {
7470     llvm::FixedVectorType *SrcTy =
7471         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7472 
7473     // %sum = add <4 x i32> %lhs, %rhs
7474     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7475     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7476     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
7477 
7478     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
7479     Constant *ShiftAmt =
7480         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7481     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
7482 
7483     // %res = trunc <4 x i32> %high to <4 x i16>
7484     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
7485   }
7486   case NEON::BI__builtin_neon_vcale_v:
7487   case NEON::BI__builtin_neon_vcaleq_v:
7488   case NEON::BI__builtin_neon_vcalt_v:
7489   case NEON::BI__builtin_neon_vcaltq_v:
7490     std::swap(Ops[0], Ops[1]);
7491     [[fallthrough]];
7492   case NEON::BI__builtin_neon_vcage_v:
7493   case NEON::BI__builtin_neon_vcageq_v:
7494   case NEON::BI__builtin_neon_vcagt_v:
7495   case NEON::BI__builtin_neon_vcagtq_v: {
7496     llvm::Type *Ty;
7497     switch (VTy->getScalarSizeInBits()) {
7498     default: llvm_unreachable("unexpected type");
7499     case 32:
7500       Ty = FloatTy;
7501       break;
7502     case 64:
7503       Ty = DoubleTy;
7504       break;
7505     case 16:
7506       Ty = HalfTy;
7507       break;
7508     }
7509     auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
7510     llvm::Type *Tys[] = { VTy, VecFlt };
7511     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7512     return EmitNeonCall(F, Ops, NameHint);
7513   }
7514   case NEON::BI__builtin_neon_vceqz_v:
7515   case NEON::BI__builtin_neon_vceqzq_v:
7516     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
7517                                          ICmpInst::ICMP_EQ, "vceqz");
7518   case NEON::BI__builtin_neon_vcgez_v:
7519   case NEON::BI__builtin_neon_vcgezq_v:
7520     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
7521                                          ICmpInst::ICMP_SGE, "vcgez");
7522   case NEON::BI__builtin_neon_vclez_v:
7523   case NEON::BI__builtin_neon_vclezq_v:
7524     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
7525                                          ICmpInst::ICMP_SLE, "vclez");
7526   case NEON::BI__builtin_neon_vcgtz_v:
7527   case NEON::BI__builtin_neon_vcgtzq_v:
7528     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
7529                                          ICmpInst::ICMP_SGT, "vcgtz");
7530   case NEON::BI__builtin_neon_vcltz_v:
7531   case NEON::BI__builtin_neon_vcltzq_v:
7532     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
7533                                          ICmpInst::ICMP_SLT, "vcltz");
7534   case NEON::BI__builtin_neon_vclz_v:
7535   case NEON::BI__builtin_neon_vclzq_v:
7536     // We generate a target-independent intrinsic, which needs a second argument
7537     // indicating whether or not clz of zero is undefined; on ARM it isn't.
7538     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
7539     break;
7540   case NEON::BI__builtin_neon_vcvt_f32_v:
7541   case NEON::BI__builtin_neon_vcvtq_f32_v:
7542     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7543     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
7544                      HasLegalHalfType);
7545     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7546                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7547   case NEON::BI__builtin_neon_vcvt_f16_s16:
7548   case NEON::BI__builtin_neon_vcvt_f16_u16:
7549   case NEON::BI__builtin_neon_vcvtq_f16_s16:
7550   case NEON::BI__builtin_neon_vcvtq_f16_u16:
7551     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7552     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
7553                      HasLegalHalfType);
7554     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7555                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7556   case NEON::BI__builtin_neon_vcvt_n_f16_s16:
7557   case NEON::BI__builtin_neon_vcvt_n_f16_u16:
7558   case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
7559   case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
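    // The _n_ variants convert between floating point and fixed point with the
    // given number of fractional bits; the intrinsic is overloaded on both the
    // floating-point and the integer vector type.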
7560     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7561     Function *F = CGM.getIntrinsic(Int, Tys);
7562     return EmitNeonCall(F, Ops, "vcvt_n");
7563   }
7564   case NEON::BI__builtin_neon_vcvt_n_f32_v:
7565   case NEON::BI__builtin_neon_vcvt_n_f64_v:
7566   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
7567   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
7568     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7569     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
7570     Function *F = CGM.getIntrinsic(Int, Tys);
7571     return EmitNeonCall(F, Ops, "vcvt_n");
7572   }
7573   case NEON::BI__builtin_neon_vcvt_n_s16_f16:
7574   case NEON::BI__builtin_neon_vcvt_n_s32_v:
7575   case NEON::BI__builtin_neon_vcvt_n_u16_f16:
7576   case NEON::BI__builtin_neon_vcvt_n_u32_v:
7577   case NEON::BI__builtin_neon_vcvt_n_s64_v:
7578   case NEON::BI__builtin_neon_vcvt_n_u64_v:
7579   case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
7580   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
7581   case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
7582   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
7583   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
7584   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
7585     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7586     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7587     return EmitNeonCall(F, Ops, "vcvt_n");
7588   }
7589   case NEON::BI__builtin_neon_vcvt_s32_v:
7590   case NEON::BI__builtin_neon_vcvt_u32_v:
7591   case NEON::BI__builtin_neon_vcvt_s64_v:
7592   case NEON::BI__builtin_neon_vcvt_u64_v:
7593   case NEON::BI__builtin_neon_vcvt_s16_f16:
7594   case NEON::BI__builtin_neon_vcvt_u16_f16:
7595   case NEON::BI__builtin_neon_vcvtq_s32_v:
7596   case NEON::BI__builtin_neon_vcvtq_u32_v:
7597   case NEON::BI__builtin_neon_vcvtq_s64_v:
7598   case NEON::BI__builtin_neon_vcvtq_u64_v:
7599   case NEON::BI__builtin_neon_vcvtq_s16_f16:
7600   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
7601     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
7602     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
7603                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
7604   }
7605   case NEON::BI__builtin_neon_vcvta_s16_f16:
7606   case NEON::BI__builtin_neon_vcvta_s32_v:
7607   case NEON::BI__builtin_neon_vcvta_s64_v:
7608   case NEON::BI__builtin_neon_vcvta_u16_f16:
7609   case NEON::BI__builtin_neon_vcvta_u32_v:
7610   case NEON::BI__builtin_neon_vcvta_u64_v:
7611   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
7612   case NEON::BI__builtin_neon_vcvtaq_s32_v:
7613   case NEON::BI__builtin_neon_vcvtaq_s64_v:
7614   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
7615   case NEON::BI__builtin_neon_vcvtaq_u32_v:
7616   case NEON::BI__builtin_neon_vcvtaq_u64_v:
7617   case NEON::BI__builtin_neon_vcvtn_s16_f16:
7618   case NEON::BI__builtin_neon_vcvtn_s32_v:
7619   case NEON::BI__builtin_neon_vcvtn_s64_v:
7620   case NEON::BI__builtin_neon_vcvtn_u16_f16:
7621   case NEON::BI__builtin_neon_vcvtn_u32_v:
7622   case NEON::BI__builtin_neon_vcvtn_u64_v:
7623   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
7624   case NEON::BI__builtin_neon_vcvtnq_s32_v:
7625   case NEON::BI__builtin_neon_vcvtnq_s64_v:
7626   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
7627   case NEON::BI__builtin_neon_vcvtnq_u32_v:
7628   case NEON::BI__builtin_neon_vcvtnq_u64_v:
7629   case NEON::BI__builtin_neon_vcvtp_s16_f16:
7630   case NEON::BI__builtin_neon_vcvtp_s32_v:
7631   case NEON::BI__builtin_neon_vcvtp_s64_v:
7632   case NEON::BI__builtin_neon_vcvtp_u16_f16:
7633   case NEON::BI__builtin_neon_vcvtp_u32_v:
7634   case NEON::BI__builtin_neon_vcvtp_u64_v:
7635   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
7636   case NEON::BI__builtin_neon_vcvtpq_s32_v:
7637   case NEON::BI__builtin_neon_vcvtpq_s64_v:
7638   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
7639   case NEON::BI__builtin_neon_vcvtpq_u32_v:
7640   case NEON::BI__builtin_neon_vcvtpq_u64_v:
7641   case NEON::BI__builtin_neon_vcvtm_s16_f16:
7642   case NEON::BI__builtin_neon_vcvtm_s32_v:
7643   case NEON::BI__builtin_neon_vcvtm_s64_v:
7644   case NEON::BI__builtin_neon_vcvtm_u16_f16:
7645   case NEON::BI__builtin_neon_vcvtm_u32_v:
7646   case NEON::BI__builtin_neon_vcvtm_u64_v:
7647   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
7648   case NEON::BI__builtin_neon_vcvtmq_s32_v:
7649   case NEON::BI__builtin_neon_vcvtmq_s64_v:
7650   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
7651   case NEON::BI__builtin_neon_vcvtmq_u32_v:
7652   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
7653     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7654     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
7655   }
7656   case NEON::BI__builtin_neon_vcvtx_f32_v: {
7657     llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
7658     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
7660   }
7661   case NEON::BI__builtin_neon_vext_v:
7662   case NEON::BI__builtin_neon_vextq_v: {
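    // vext extracts consecutive lanes from the concatenation of the two
    // inputs, starting at the immediate, which maps directly onto a
    // shufflevector with indices CV, CV+1, ..., CV+n-1.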
7663     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
7664     SmallVector<int, 16> Indices;
7665     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7666       Indices.push_back(i+CV);
7667 
7668     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7669     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7670     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
7671   }
7672   case NEON::BI__builtin_neon_vfma_v:
7673   case NEON::BI__builtin_neon_vfmaq_v: {
7674     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7675     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7676     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7677 
7678     // The NEON intrinsic puts the accumulator first, unlike LLVM's fma.
7679     return emitCallMaybeConstrainedFPBuiltin(
7680         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
7681         {Ops[1], Ops[2], Ops[0]});
7682   }
7683   case NEON::BI__builtin_neon_vld1_v:
7684   case NEON::BI__builtin_neon_vld1q_v: {
7685     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7686     Ops.push_back(getAlignmentValue32(PtrOp0));
7687     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
7688   }
7689   case NEON::BI__builtin_neon_vld1_x2_v:
7690   case NEON::BI__builtin_neon_vld1q_x2_v:
7691   case NEON::BI__builtin_neon_vld1_x3_v:
7692   case NEON::BI__builtin_neon_vld1q_x3_v:
7693   case NEON::BI__builtin_neon_vld1_x4_v:
7694   case NEON::BI__builtin_neon_vld1q_x4_v: {
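    // The intrinsic returns a struct of N vectors; the builtin instead writes
    // its result through the pointer passed in Ops[0], so store the returned
    // aggregate there.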
7695     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7696     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7697     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
7698     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7699   }
7700   case NEON::BI__builtin_neon_vld2_v:
7701   case NEON::BI__builtin_neon_vld2q_v:
7702   case NEON::BI__builtin_neon_vld3_v:
7703   case NEON::BI__builtin_neon_vld3q_v:
7704   case NEON::BI__builtin_neon_vld4_v:
7705   case NEON::BI__builtin_neon_vld4q_v:
7706   case NEON::BI__builtin_neon_vld2_dup_v:
7707   case NEON::BI__builtin_neon_vld2q_dup_v:
7708   case NEON::BI__builtin_neon_vld3_dup_v:
7709   case NEON::BI__builtin_neon_vld3q_dup_v:
7710   case NEON::BI__builtin_neon_vld4_dup_v:
7711   case NEON::BI__builtin_neon_vld4q_dup_v: {
7712     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7713     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7714     Value *Align = getAlignmentValue32(PtrOp1);
7715     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
7716     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7717   }
7718   case NEON::BI__builtin_neon_vld1_dup_v:
7719   case NEON::BI__builtin_neon_vld1q_dup_v: {
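    // Load a single element and splat it: insert the loaded value into lane 0
    // of a poison vector, then broadcast it to every lane.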
7720     Value *V = PoisonValue::get(Ty);
7721     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
7722     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
7723     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
7724     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
7725     return EmitNeonSplat(Ops[0], CI);
7726   }
7727   case NEON::BI__builtin_neon_vld2_lane_v:
7728   case NEON::BI__builtin_neon_vld2q_lane_v:
7729   case NEON::BI__builtin_neon_vld3_lane_v:
7730   case NEON::BI__builtin_neon_vld3q_lane_v:
7731   case NEON::BI__builtin_neon_vld4_lane_v:
7732   case NEON::BI__builtin_neon_vld4q_lane_v: {
7733     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7734     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7735     for (unsigned I = 2; I < Ops.size() - 1; ++I)
7736       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
7737     Ops.push_back(getAlignmentValue32(PtrOp1));
7738     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint);
7739     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7740   }
7741   case NEON::BI__builtin_neon_vmovl_v: {
7742     llvm::FixedVectorType *DTy =
7743         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7744     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
7745     if (Usgn)
7746       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
7747     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
7748   }
7749   case NEON::BI__builtin_neon_vmovn_v: {
7750     llvm::FixedVectorType *QTy =
7751         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7752     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
7753     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
7754   }
7755   case NEON::BI__builtin_neon_vmull_v:
7756     // FIXME: the integer vmull operations could be emitted in terms of pure
7757     // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
7758     // hoisting the exts outside loops. Until global ISel comes along that can
7759     // see through such movement this leads to bad CodeGen. So we need an
7760     // intrinsic for now.
7761     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
7762     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
7763     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
7764   case NEON::BI__builtin_neon_vpadal_v:
7765   case NEON::BI__builtin_neon_vpadalq_v: {
7766     // The source operand type has twice as many elements, each half the size.
7767     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7768     llvm::Type *EltTy =
7769       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7770     auto *NarrowTy =
7771         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7772     llvm::Type *Tys[2] = { Ty, NarrowTy };
7773     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7774   }
7775   case NEON::BI__builtin_neon_vpaddl_v:
7776   case NEON::BI__builtin_neon_vpaddlq_v: {
7777     // The source operand type has twice as many elements, each half the size.
7778     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7779     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7780     auto *NarrowTy =
7781         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7782     llvm::Type *Tys[2] = { Ty, NarrowTy };
7783     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
7784   }
7785   case NEON::BI__builtin_neon_vqdmlal_v:
7786   case NEON::BI__builtin_neon_vqdmlsl_v: {
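    // Split into a saturating doubling multiply-long of the last two operands,
    // followed by a saturating add/sub of the product with the accumulator.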
7787     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
7788     Ops[1] =
7789         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
7790     Ops.resize(2);
7791     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
7792   }
7793   case NEON::BI__builtin_neon_vqdmulhq_lane_v:
7794   case NEON::BI__builtin_neon_vqdmulh_lane_v:
7795   case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
7796   case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
7797     auto *RTy = cast<llvm::FixedVectorType>(Ty);
7798     if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
7799         BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
7800       RTy = llvm::FixedVectorType::get(RTy->getElementType(),
7801                                        RTy->getNumElements() * 2);
7802     llvm::Type *Tys[2] = {
7803         RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7804                                              /*isQuad*/ false))};
7805     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7806   }
7807   case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
7808   case NEON::BI__builtin_neon_vqdmulh_laneq_v:
7809   case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
7810   case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
7811     llvm::Type *Tys[2] = {
7812         Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7813                                             /*isQuad*/ true))};
7814     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7815   }
7816   case NEON::BI__builtin_neon_vqshl_n_v:
7817   case NEON::BI__builtin_neon_vqshlq_n_v:
7818     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
7819                         1, false);
7820   case NEON::BI__builtin_neon_vqshlu_n_v:
7821   case NEON::BI__builtin_neon_vqshluq_n_v:
7822     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
7823                         1, false);
7824   case NEON::BI__builtin_neon_vrecpe_v:
7825   case NEON::BI__builtin_neon_vrecpeq_v:
7826   case NEON::BI__builtin_neon_vrsqrte_v:
7827   case NEON::BI__builtin_neon_vrsqrteq_v:
7828     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
7829     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7830   case NEON::BI__builtin_neon_vrndi_v:
7831   case NEON::BI__builtin_neon_vrndiq_v:
7832     Int = Builder.getIsFPConstrained()
7833               ? Intrinsic::experimental_constrained_nearbyint
7834               : Intrinsic::nearbyint;
7835     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7836   case NEON::BI__builtin_neon_vrshr_n_v:
7837   case NEON::BI__builtin_neon_vrshrq_n_v:
7838     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
7839                         1, true);
7840   case NEON::BI__builtin_neon_vsha512hq_u64:
7841   case NEON::BI__builtin_neon_vsha512h2q_u64:
7842   case NEON::BI__builtin_neon_vsha512su0q_u64:
7843   case NEON::BI__builtin_neon_vsha512su1q_u64: {
7844     Function *F = CGM.getIntrinsic(Int);
7845     return EmitNeonCall(F, Ops, "");
7846   }
7847   case NEON::BI__builtin_neon_vshl_n_v:
7848   case NEON::BI__builtin_neon_vshlq_n_v:
7849     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
7850     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
7851                              "vshl_n");
7852   case NEON::BI__builtin_neon_vshll_n_v: {
7853     llvm::FixedVectorType *SrcTy =
7854         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
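    // Widen the narrow source, then shift left, e.g.:
    //   %ext = zext/sext <4 x i16> %src to <4 x i32>
    //   %res = shl <4 x i32> %ext, <i32 n, i32 n, i32 n, i32 n>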
7855     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7856     if (Usgn)
7857       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
7858     else
7859       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
7860     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
7861     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
7862   }
7863   case NEON::BI__builtin_neon_vshrn_n_v: {
7864     llvm::FixedVectorType *SrcTy =
7865         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
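    // Shift the wide source right, then narrow, e.g.:
    //   %shifted = lshr/ashr <4 x i32> %src, <i32 n, i32 n, i32 n, i32 n>
    //   %res = trunc <4 x i32> %shifted to <4 x i16>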
7866     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7867     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
7868     if (Usgn)
7869       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
7870     else
7871       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
7872     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
7873   }
7874   case NEON::BI__builtin_neon_vshr_n_v:
7875   case NEON::BI__builtin_neon_vshrq_n_v:
7876     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
7877   case NEON::BI__builtin_neon_vst1_v:
7878   case NEON::BI__builtin_neon_vst1q_v:
7879   case NEON::BI__builtin_neon_vst2_v:
7880   case NEON::BI__builtin_neon_vst2q_v:
7881   case NEON::BI__builtin_neon_vst3_v:
7882   case NEON::BI__builtin_neon_vst3q_v:
7883   case NEON::BI__builtin_neon_vst4_v:
7884   case NEON::BI__builtin_neon_vst4q_v:
7885   case NEON::BI__builtin_neon_vst2_lane_v:
7886   case NEON::BI__builtin_neon_vst2q_lane_v:
7887   case NEON::BI__builtin_neon_vst3_lane_v:
7888   case NEON::BI__builtin_neon_vst3q_lane_v:
7889   case NEON::BI__builtin_neon_vst4_lane_v:
7890   case NEON::BI__builtin_neon_vst4q_lane_v: {
7891     llvm::Type *Tys[] = {Int8PtrTy, Ty};
7892     Ops.push_back(getAlignmentValue32(PtrOp0));
7893     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
7894   }
7895   case NEON::BI__builtin_neon_vsm3partw1q_u32:
7896   case NEON::BI__builtin_neon_vsm3partw2q_u32:
7897   case NEON::BI__builtin_neon_vsm3ss1q_u32:
7898   case NEON::BI__builtin_neon_vsm4ekeyq_u32:
7899   case NEON::BI__builtin_neon_vsm4eq_u32: {
7900     Function *F = CGM.getIntrinsic(Int);
7901     return EmitNeonCall(F, Ops, "");
7902   }
7903   case NEON::BI__builtin_neon_vsm3tt1aq_u32:
7904   case NEON::BI__builtin_neon_vsm3tt1bq_u32:
7905   case NEON::BI__builtin_neon_vsm3tt2aq_u32:
7906   case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
7907     Function *F = CGM.getIntrinsic(Int);
7908     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7909     return EmitNeonCall(F, Ops, "");
7910   }
7911   case NEON::BI__builtin_neon_vst1_x2_v:
7912   case NEON::BI__builtin_neon_vst1q_x2_v:
7913   case NEON::BI__builtin_neon_vst1_x3_v:
7914   case NEON::BI__builtin_neon_vst1q_x3_v:
7915   case NEON::BI__builtin_neon_vst1_x4_v:
7916   case NEON::BI__builtin_neon_vst1q_x4_v: {
7917     // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
7918     // in AArch64 it comes last. We may want to standardize on one or the other.
7919     if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
7920         Arch == llvm::Triple::aarch64_32) {
7921       llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7922       std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7923       return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7924     }
7925     llvm::Type *Tys[2] = {UnqualPtrTy, VTy};
7926     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7927   }
7928   case NEON::BI__builtin_neon_vsubhn_v: {
7929     llvm::FixedVectorType *SrcTy =
7930         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7931 
7932     // %diff = sub <4 x i32> %lhs, %rhs
7933     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7934     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7935     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
7936 
7937     // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
7938     Constant *ShiftAmt =
7939         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7940     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
7941 
7942     // %res = trunc <4 x i32> %high to <4 x i16>
7943     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
7944   }
7945   case NEON::BI__builtin_neon_vtrn_v:
7946   case NEON::BI__builtin_neon_vtrnq_v: {
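    // vtrn returns two vectors through the pointer in Ops[0]: result 0
    // interleaves the even-numbered lanes of the two inputs and result 1 the
    // odd-numbered lanes. Emit one shufflevector per result and store it into
    // its slot of the returned pair.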
7947     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7948     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7949     Value *SV = nullptr;
7950 
7951     for (unsigned vi = 0; vi != 2; ++vi) {
7952       SmallVector<int, 16> Indices;
7953       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7954         Indices.push_back(i+vi);
7955         Indices.push_back(i+e+vi);
7956       }
7957       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7958       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
7959       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7960     }
7961     return SV;
7962   }
7963   case NEON::BI__builtin_neon_vtst_v:
7964   case NEON::BI__builtin_neon_vtstq_v: {
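    // A vtst lane is all-ones where (a & b) != 0 and all-zeros otherwise, so
    // AND the operands, compare against zero, and sign-extend the i1 result.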
7965     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7966     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7967     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
7968     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
7969                                 ConstantAggregateZero::get(Ty));
7970     return Builder.CreateSExt(Ops[0], Ty, "vtst");
7971   }
7972   case NEON::BI__builtin_neon_vuzp_v:
7973   case NEON::BI__builtin_neon_vuzpq_v: {
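    // vuzp de-interleaves: result 0 gathers the even-numbered lanes of the
    // concatenated inputs and result 1 the odd-numbered lanes.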
7974     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7975     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7976     Value *SV = nullptr;
7977 
7978     for (unsigned vi = 0; vi != 2; ++vi) {
7979       SmallVector<int, 16> Indices;
7980       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7981         Indices.push_back(2*i+vi);
7982 
7983       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7984       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
7985       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7986     }
7987     return SV;
7988   }
7989   case NEON::BI__builtin_neon_vxarq_u64: {
7990     Function *F = CGM.getIntrinsic(Int);
7991     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
7992     return EmitNeonCall(F, Ops, "");
7993   }
7994   case NEON::BI__builtin_neon_vzip_v:
7995   case NEON::BI__builtin_neon_vzipq_v: {
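    // vzip interleaves: result 0 zips together the low halves of the two
    // inputs and result 1 the high halves.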
7996     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7997     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7998     Value *SV = nullptr;
7999 
8000     for (unsigned vi = 0; vi != 2; ++vi) {
8001       SmallVector<int, 16> Indices;
8002       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8003         Indices.push_back((i + vi*e) >> 1);
8004         Indices.push_back(((i + vi*e) >> 1)+e);
8005       }
8006       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8007       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
8008       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8009     }
8010     return SV;
8011   }
8012   case NEON::BI__builtin_neon_vdot_s32:
8013   case NEON::BI__builtin_neon_vdot_u32:
8014   case NEON::BI__builtin_neon_vdotq_s32:
8015   case NEON::BI__builtin_neon_vdotq_u32: {
8016     auto *InputTy =
8017         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8018     llvm::Type *Tys[2] = { Ty, InputTy };
8019     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
8020   }
8021   case NEON::BI__builtin_neon_vfmlal_low_f16:
8022   case NEON::BI__builtin_neon_vfmlalq_low_f16: {
8023     auto *InputTy =
8024         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8025     llvm::Type *Tys[2] = { Ty, InputTy };
8026     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
8027   }
8028   case NEON::BI__builtin_neon_vfmlsl_low_f16:
8029   case NEON::BI__builtin_neon_vfmlslq_low_f16: {
8030     auto *InputTy =
8031         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8032     llvm::Type *Tys[2] = { Ty, InputTy };
8033     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
8034   }
8035   case NEON::BI__builtin_neon_vfmlal_high_f16:
8036   case NEON::BI__builtin_neon_vfmlalq_high_f16: {
8037     auto *InputTy =
8038         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8039     llvm::Type *Tys[2] = { Ty, InputTy };
8040     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
8041   }
8042   case NEON::BI__builtin_neon_vfmlsl_high_f16:
8043   case NEON::BI__builtin_neon_vfmlslq_high_f16: {
8044     auto *InputTy =
8045         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8046     llvm::Type *Tys[2] = { Ty, InputTy };
8047     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
8048   }
8049   case NEON::BI__builtin_neon_vmmlaq_s32:
8050   case NEON::BI__builtin_neon_vmmlaq_u32: {
8051     auto *InputTy =
8052         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8053     llvm::Type *Tys[2] = { Ty, InputTy };
8054     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla");
8055   }
8056   case NEON::BI__builtin_neon_vusmmlaq_s32: {
8057     auto *InputTy =
8058         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8059     llvm::Type *Tys[2] = { Ty, InputTy };
8060     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
8061   }
8062   case NEON::BI__builtin_neon_vusdot_s32:
8063   case NEON::BI__builtin_neon_vusdotq_s32: {
8064     auto *InputTy =
8065         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8066     llvm::Type *Tys[2] = { Ty, InputTy };
8067     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
8068   }
8069   case NEON::BI__builtin_neon_vbfdot_f32:
8070   case NEON::BI__builtin_neon_vbfdotq_f32: {
8071     llvm::Type *InputTy =
8072         llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
8073     llvm::Type *Tys[2] = { Ty, InputTy };
8074     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
8075   }
8076   case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
8077     llvm::Type *Tys[1] = { Ty };
8078     Function *F = CGM.getIntrinsic(Int, Tys);
8079     return EmitNeonCall(F, Ops, "vcvtfp2bf");
8080   }
8082   }
8083 
8084   assert(Int && "Expected valid intrinsic number");
8085 
8086   // Determine the type(s) of this overloaded AArch64 intrinsic.
8087   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
8088 
8089   Value *Result = EmitNeonCall(F, Ops, NameHint);
8090   llvm::Type *ResultType = ConvertType(E->getType());
8091   // AArch64 intrinsics can return a one-element vector; cast it back to the
8092   // scalar type expected by the builtin.
8093   return Builder.CreateBitCast(Result, ResultType, NameHint);
8094 }
8095 
8096 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
8097     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
8098     const CmpInst::Predicate Ip, const Twine &Name) {
8099   llvm::Type *OTy = Op->getType();
8100 
8101   // FIXME: this is utterly horrific. We should not be looking at previous
8102   // codegen context to find out what needs doing. Unfortunately TableGen
8103   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
8104   // (etc).
8105   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
8106     OTy = BI->getOperand(0)->getType();
8107 
8108   Op = Builder.CreateBitCast(Op, OTy);
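  // Compare against zero and sign-extend the i1 result so that "true" becomes
  // all-ones and "false" all-zeros, as the builtins require.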
8109   if (OTy->getScalarType()->isFloatingPointTy()) {
8110     if (Fp == CmpInst::FCMP_OEQ)
8111       Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
8112     else
8113       Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
8114   } else {
8115     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
8116   }
8117   return Builder.CreateSExt(Op, Ty, Name);
8118 }
8119 
8120 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
8121                                  Value *ExtOp, Value *IndexOp,
8122                                  llvm::Type *ResTy, unsigned IntID,
8123                                  const char *Name) {
8124   SmallVector<Value *, 2> TblOps;
8125   if (ExtOp)
8126     TblOps.push_back(ExtOp);
8127 
8128   // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
8129   SmallVector<int, 16> Indices;
8130   auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
8131   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
8132     Indices.push_back(2*i);
8133     Indices.push_back(2*i+1);
8134   }
8135 
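  // Concatenate the 64-bit table operands pairwise into the 128-bit table
  // vectors that the AArch64 TBL/TBX intrinsics expect.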
8136   int PairPos = 0, End = Ops.size() - 1;
8137   while (PairPos < End) {
8138     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
8139                                                      Ops[PairPos+1], Indices,
8140                                                      Name));
8141     PairPos += 2;
8142   }
8143 
8144   // If there's an odd number of 64-bit lookup tables, fill the high 64 bits
8145   // of the last 128-bit lookup table with zero.
8146   if (PairPos == End) {
8147     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
8148     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
8149                                                      ZeroTbl, Indices, Name));
8150   }
8151 
8152   Function *TblF;
8153   TblOps.push_back(IndexOp);
8154   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
8155 
8156   return CGF.EmitNeonCall(TblF, TblOps, Name);
8157 }
8158 
8159 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
8160   unsigned Value;
8161   switch (BuiltinID) {
8162   default:
8163     return nullptr;
8164   case clang::ARM::BI__builtin_arm_nop:
8165     Value = 0;
8166     break;
8167   case clang::ARM::BI__builtin_arm_yield:
8168   case clang::ARM::BI__yield:
8169     Value = 1;
8170     break;
8171   case clang::ARM::BI__builtin_arm_wfe:
8172   case clang::ARM::BI__wfe:
8173     Value = 2;
8174     break;
8175   case clang::ARM::BI__builtin_arm_wfi:
8176   case clang::ARM::BI__wfi:
8177     Value = 3;
8178     break;
8179   case clang::ARM::BI__builtin_arm_sev:
8180   case clang::ARM::BI__sev:
8181     Value = 4;
8182     break;
8183   case clang::ARM::BI__builtin_arm_sevl:
8184   case clang::ARM::BI__sevl:
8185     Value = 5;
8186     break;
8187   }
8188 
8189   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
8190                             llvm::ConstantInt::get(Int32Ty, Value));
8191 }
8192 
8193 enum SpecialRegisterAccessKind {
8194   NormalRead,
8195   VolatileRead,
8196   Write,
8197 };
8198 
8199 // Generates the IR for the __builtin_amdgcn_read_exec* family of builtins,
8200 // lowering them to the amdgcn_ballot intrinsic.
8201 static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
8202                                       llvm::Type *RegisterType,
8203                                       llvm::Type *ValueType, bool isExecHi) {
8204   CodeGen::CGBuilderTy &Builder = CGF.Builder;
8205   CodeGen::CodeGenModule &CGM = CGF.CGM;
8206 
8207   Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
8208   llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
8209 
8210   if (isExecHi) {
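    // read_exec_hi wants the upper 32 bits of the 64-bit exec mask.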
8211     Value *Rt2 = Builder.CreateLShr(Call, 32);
8212     Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
8213     return Rt2;
8214   }
8215 
8216   return Call;
8217 }
8218 
8219 // Generates the IR for the read/write special register builtin.
8220 // ValueType is the type of the value that is to be written or read;
8221 // RegisterType is the type of the register being written to or read from.
8222 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
8223                                          const CallExpr *E,
8224                                          llvm::Type *RegisterType,
8225                                          llvm::Type *ValueType,
8226                                          SpecialRegisterAccessKind AccessKind,
8227                                          StringRef SysReg = "") {
  // Read and write register intrinsics only support 32-, 64- and 128-bit
  // operations.
8229   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
8230           RegisterType->isIntegerTy(128)) &&
8231          "Unsupported size for register.");
8232 
8233   CodeGen::CGBuilderTy &Builder = CGF.Builder;
8234   CodeGen::CodeGenModule &CGM = CGF.CGM;
8235   LLVMContext &Context = CGM.getLLVMContext();
8236 
8237   if (SysReg.empty()) {
8238     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
8239     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
8240   }
8241 
8242   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
8243   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
8244   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
8245 
8246   llvm::Type *Types[] = { RegisterType };
8247 
8248   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
8249   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
8250             && "Can't fit 64-bit value in 32-bit register");
8251 
8252   if (AccessKind != Write) {
8253     assert(AccessKind == NormalRead || AccessKind == VolatileRead);
8254     llvm::Function *F = CGM.getIntrinsic(
8255         AccessKind == VolatileRead ? llvm::Intrinsic::read_volatile_register
8256                                    : llvm::Intrinsic::read_register,
8257         Types);
8258     llvm::Value *Call = Builder.CreateCall(F, Metadata);
8259 
8260     if (MixedTypes)
8261       // Read into 64 bit register and then truncate result to 32 bit.
8262       return Builder.CreateTrunc(Call, ValueType);
8263 
8264     if (ValueType->isPointerTy())
8265       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
8266       return Builder.CreateIntToPtr(Call, ValueType);
8267 
8268     return Call;
8269   }
8270 
8271   llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
8272   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
8273   if (MixedTypes) {
8274     // Extend 32 bit write value to 64 bit to pass to write.
8275     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
8276     return Builder.CreateCall(F, { Metadata, ArgValue });
8277   }
8278 
8279   if (ValueType->isPointerTy()) {
    // Have a VoidPtrTy ArgValue but the intrinsic takes an i32/i64.
8281     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
8282     return Builder.CreateCall(F, { Metadata, ArgValue });
8283   }
8284 
8285   return Builder.CreateCall(F, { Metadata, ArgValue });
8286 }
8287 
8288 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
8289 /// argument that specifies the vector type.
8290 static bool HasExtraNeonArgument(unsigned BuiltinID) {
8291   switch (BuiltinID) {
8292   default: break;
8293   case NEON::BI__builtin_neon_vget_lane_i8:
8294   case NEON::BI__builtin_neon_vget_lane_i16:
8295   case NEON::BI__builtin_neon_vget_lane_bf16:
8296   case NEON::BI__builtin_neon_vget_lane_i32:
8297   case NEON::BI__builtin_neon_vget_lane_i64:
8298   case NEON::BI__builtin_neon_vget_lane_f32:
8299   case NEON::BI__builtin_neon_vgetq_lane_i8:
8300   case NEON::BI__builtin_neon_vgetq_lane_i16:
8301   case NEON::BI__builtin_neon_vgetq_lane_bf16:
8302   case NEON::BI__builtin_neon_vgetq_lane_i32:
8303   case NEON::BI__builtin_neon_vgetq_lane_i64:
8304   case NEON::BI__builtin_neon_vgetq_lane_f32:
8305   case NEON::BI__builtin_neon_vduph_lane_bf16:
8306   case NEON::BI__builtin_neon_vduph_laneq_bf16:
8307   case NEON::BI__builtin_neon_vset_lane_i8:
8308   case NEON::BI__builtin_neon_vset_lane_i16:
8309   case NEON::BI__builtin_neon_vset_lane_bf16:
8310   case NEON::BI__builtin_neon_vset_lane_i32:
8311   case NEON::BI__builtin_neon_vset_lane_i64:
8312   case NEON::BI__builtin_neon_vset_lane_f32:
8313   case NEON::BI__builtin_neon_vsetq_lane_i8:
8314   case NEON::BI__builtin_neon_vsetq_lane_i16:
8315   case NEON::BI__builtin_neon_vsetq_lane_bf16:
8316   case NEON::BI__builtin_neon_vsetq_lane_i32:
8317   case NEON::BI__builtin_neon_vsetq_lane_i64:
8318   case NEON::BI__builtin_neon_vsetq_lane_f32:
8319   case NEON::BI__builtin_neon_vsha1h_u32:
8320   case NEON::BI__builtin_neon_vsha1cq_u32:
8321   case NEON::BI__builtin_neon_vsha1pq_u32:
8322   case NEON::BI__builtin_neon_vsha1mq_u32:
8323   case NEON::BI__builtin_neon_vcvth_bf16_f32:
8324   case clang::ARM::BI_MoveToCoprocessor:
8325   case clang::ARM::BI_MoveToCoprocessor2:
8326     return false;
8327   }
8328   return true;
8329 }
8330 
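// Emit IR for an AArch32 ARM builtin call. Hint, system-register, exclusive
// load/store, coprocessor and CRC builtins are handled directly; MVE, CDE and
// most NEON builtins are dispatched to their dedicated emitters below.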
8331 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
8332                                            const CallExpr *E,
8333                                            ReturnValueSlot ReturnValue,
8334                                            llvm::Triple::ArchType Arch) {
8335   if (auto Hint = GetValueForARMHint(BuiltinID))
8336     return Hint;
8337 
8338   if (BuiltinID == clang::ARM::BI__emit) {
8339     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
8340     llvm::FunctionType *FTy =
8341         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
8342 
8343     Expr::EvalResult Result;
8344     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
8345       llvm_unreachable("Sema will ensure that the parameter is constant");
8346 
8347     llvm::APSInt Value = Result.Val.getInt();
8348     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
8349 
8350     llvm::InlineAsm *Emit =
8351         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
8352                                  /*hasSideEffects=*/true)
8353                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
8354                                  /*hasSideEffects=*/true);
8355 
8356     return Builder.CreateCall(Emit);
8357   }
8358 
8359   if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
8360     Value *Option = EmitScalarExpr(E->getArg(0));
8361     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
8362   }
8363 
8364   if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
8365     Value *Address = EmitScalarExpr(E->getArg(0));
8366     Value *RW      = EmitScalarExpr(E->getArg(1));
8367     Value *IsData  = EmitScalarExpr(E->getArg(2));
8368 
    // Locality is not supported on the ARM target.
8370     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
8371 
8372     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
8373     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
8374   }
8375 
8376   if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
8377     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8378     return Builder.CreateCall(
8379         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
8380   }
8381 
8382   if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
8383       BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
8384     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8385     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
8386     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
8387     if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
8388       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
8389     return Res;
8390   }
8391 
8393   if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
8394     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8395     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
8396   }
8397   if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
8398     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8399     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
8400                               "cls");
8401   }
8402 
8403   if (BuiltinID == clang::ARM::BI__clear_cache) {
8404     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
8405     const FunctionDecl *FD = E->getDirectCallee();
8406     Value *Ops[2];
8407     for (unsigned i = 0; i < 2; i++)
8408       Ops[i] = EmitScalarExpr(E->getArg(i));
8409     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
8410     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
8411     StringRef Name = FD->getName();
8412     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
8413   }
8414 
8415   if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
8416       BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
8417     Function *F;
8418 
8419     switch (BuiltinID) {
8420     default: llvm_unreachable("unexpected builtin");
8421     case clang::ARM::BI__builtin_arm_mcrr:
8422       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
8423       break;
8424     case clang::ARM::BI__builtin_arm_mcrr2:
8425       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
8426       break;
8427     }
8428 
    // The MCRR{2} instruction has 5 operands, but the intrinsic has only 4
    // because Rt and Rt2 are represented as a single unsigned 64-bit integer
    // in the intrinsic definition; internally they are two 32-bit integers.
8435 
8436     Value *Coproc = EmitScalarExpr(E->getArg(0));
8437     Value *Opc1 = EmitScalarExpr(E->getArg(1));
8438     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
8439     Value *CRm = EmitScalarExpr(E->getArg(3));
8440 
8441     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
8442     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
8443     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
8444     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
8445 
8446     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
8447   }
8448 
8449   if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
8450       BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
8451     Function *F;
8452 
8453     switch (BuiltinID) {
8454     default: llvm_unreachable("unexpected builtin");
8455     case clang::ARM::BI__builtin_arm_mrrc:
8456       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
8457       break;
8458     case clang::ARM::BI__builtin_arm_mrrc2:
8459       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
8460       break;
8461     }
8462 
8463     Value *Coproc = EmitScalarExpr(E->getArg(0));
8464     Value *Opc1 = EmitScalarExpr(E->getArg(1));
8465     Value *CRm  = EmitScalarExpr(E->getArg(2));
8466     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
8467 
    // The intrinsic returns an unsigned 64-bit integer, represented as two
    // 32-bit integers, which we recombine into a single i64 below.
8470 
8471     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
8472     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
8473     Rt = Builder.CreateZExt(Rt, Int64Ty);
8474     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
8475 
8476     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
8477     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
8478     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
8479 
8480     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
8481   }
8482 
8483   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
8484       ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8485         BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
8486        getContext().getTypeSize(E->getType()) == 64) ||
8487       BuiltinID == clang::ARM::BI__ldrexd) {
8488     Function *F;
8489 
8490     switch (BuiltinID) {
8491     default: llvm_unreachable("unexpected builtin");
8492     case clang::ARM::BI__builtin_arm_ldaex:
8493       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
8494       break;
8495     case clang::ARM::BI__builtin_arm_ldrexd:
8496     case clang::ARM::BI__builtin_arm_ldrex:
8497     case clang::ARM::BI__ldrexd:
8498       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
8499       break;
8500     }
8501 
8502     Value *LdPtr = EmitScalarExpr(E->getArg(0));
8503     Value *Val = Builder.CreateCall(F, LdPtr, "ldrexd");
8504 
8505     Value *Val0 = Builder.CreateExtractValue(Val, 1);
8506     Value *Val1 = Builder.CreateExtractValue(Val, 0);
8507     Val0 = Builder.CreateZExt(Val0, Int64Ty);
8508     Val1 = Builder.CreateZExt(Val1, Int64Ty);
8509 
8510     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
8511     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
8512     Val = Builder.CreateOr(Val, Val1);
8513     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
8514   }
8515 
8516   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8517       BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
8518     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
8519 
8520     QualType Ty = E->getType();
8521     llvm::Type *RealResTy = ConvertType(Ty);
8522     llvm::Type *IntTy =
8523         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8524 
8525     Function *F = CGM.getIntrinsic(
8526         BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
8527                                                        : Intrinsic::arm_ldrex,
8528         UnqualPtrTy);
8529     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
8530     Val->addParamAttr(
8531         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
8532 
8533     if (RealResTy->isPointerTy())
8534       return Builder.CreateIntToPtr(Val, RealResTy);
8535     else {
8536       llvm::Type *IntResTy = llvm::IntegerType::get(
8537           getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
8538       return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
8539                                    RealResTy);
8540     }
8541   }
8542 
8543   if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
8544       ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
8545         BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
8546        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
8547     Function *F = CGM.getIntrinsic(
8548         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
8549                                                        : Intrinsic::arm_strexd);
8550     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
8551 
8552     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
8553     Value *Val = EmitScalarExpr(E->getArg(0));
8554     Builder.CreateStore(Val, Tmp);
8555 
8556     Address LdPtr = Tmp.withElementType(STy);
8557     Val = Builder.CreateLoad(LdPtr);
8558 
8559     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
8560     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
8561     Value *StPtr = EmitScalarExpr(E->getArg(1));
8562     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
8563   }
8564 
8565   if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
8566       BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
8567     Value *StoreVal = EmitScalarExpr(E->getArg(0));
8568     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
8569 
8570     QualType Ty = E->getArg(0)->getType();
8571     llvm::Type *StoreTy =
8572         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8573 
8574     if (StoreVal->getType()->isPointerTy())
8575       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
8576     else {
8577       llvm::Type *IntTy = llvm::IntegerType::get(
8578           getLLVMContext(),
8579           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
8580       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
8581       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
8582     }
8583 
8584     Function *F = CGM.getIntrinsic(
8585         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
8586                                                        : Intrinsic::arm_strex,
8587         StoreAddr->getType());
8588 
8589     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
8590     CI->addParamAttr(
8591         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
8592     return CI;
8593   }
8594 
8595   if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
8596     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
8597     return Builder.CreateCall(F);
8598   }
8599 
8600   // CRC32
8601   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
8602   switch (BuiltinID) {
8603   case clang::ARM::BI__builtin_arm_crc32b:
8604     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
8605   case clang::ARM::BI__builtin_arm_crc32cb:
8606     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
8607   case clang::ARM::BI__builtin_arm_crc32h:
8608     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
8609   case clang::ARM::BI__builtin_arm_crc32ch:
8610     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
8611   case clang::ARM::BI__builtin_arm_crc32w:
8612   case clang::ARM::BI__builtin_arm_crc32d:
8613     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
8614   case clang::ARM::BI__builtin_arm_crc32cw:
8615   case clang::ARM::BI__builtin_arm_crc32cd:
8616     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
8617   }
8618 
8619   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
8620     Value *Arg0 = EmitScalarExpr(E->getArg(0));
8621     Value *Arg1 = EmitScalarExpr(E->getArg(1));
8622 
8623     // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
8624     // intrinsics, hence we need different codegen for these cases.
8625     if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
8626         BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
8627       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
8628       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
8629       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
8630       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
8631 
8632       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8633       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
8634       return Builder.CreateCall(F, {Res, Arg1b});
8635     } else {
8636       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
8637 
8638       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8639       return Builder.CreateCall(F, {Arg0, Arg1});
8640     }
8641   }
8642 
8643   if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8644       BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8645       BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8646       BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
8647       BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
8648       BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
8649 
8650     SpecialRegisterAccessKind AccessKind = Write;
8651     if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8652         BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8653         BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
8654       AccessKind = VolatileRead;
8655 
8656     bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8657                             BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
8658 
8659     bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8660                    BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
8661 
8662     llvm::Type *ValueType;
8663     llvm::Type *RegisterType;
8664     if (IsPointerBuiltin) {
8665       ValueType = VoidPtrTy;
8666       RegisterType = Int32Ty;
8667     } else if (Is64Bit) {
8668       ValueType = RegisterType = Int64Ty;
8669     } else {
8670       ValueType = RegisterType = Int32Ty;
8671     }
8672 
8673     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
8674                                       AccessKind);
8675   }
8676 
8677   if (BuiltinID == ARM::BI__builtin_sponentry) {
8678     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
8679     return Builder.CreateCall(F);
8680   }
8681 
8682   // Handle MSVC intrinsics before argument evaluation to prevent double
8683   // evaluation.
8684   if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
8685     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
8686 
8687   // Deal with MVE builtins
8688   if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8689     return Result;
8690   // Handle CDE builtins
8691   if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8692     return Result;
8693 
  // Some intrinsics are equivalent; if so, use the base intrinsic ID.
8695   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
8696     return P.first == BuiltinID;
8697   });
8698   if (It != end(NEONEquivalentIntrinsicMap))
8699     BuiltinID = It->second;
8700 
8701   // Find out if any arguments are required to be integer constant
8702   // expressions.
8703   unsigned ICEArguments = 0;
8704   ASTContext::GetBuiltinTypeError Error;
8705   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
8706   assert(Error == ASTContext::GE_None && "Should not codegen an error");
8707 
8708   auto getAlignmentValue32 = [&](Address addr) -> Value* {
8709     return Builder.getInt32(addr.getAlignment().getQuantity());
8710   };
8711 
8712   Address PtrOp0 = Address::invalid();
8713   Address PtrOp1 = Address::invalid();
8714   SmallVector<Value*, 4> Ops;
8715   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
8716   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
8717   for (unsigned i = 0, e = NumArgs; i != e; i++) {
8718     if (i == 0) {
8719       switch (BuiltinID) {
8720       case NEON::BI__builtin_neon_vld1_v:
8721       case NEON::BI__builtin_neon_vld1q_v:
8722       case NEON::BI__builtin_neon_vld1q_lane_v:
8723       case NEON::BI__builtin_neon_vld1_lane_v:
8724       case NEON::BI__builtin_neon_vld1_dup_v:
8725       case NEON::BI__builtin_neon_vld1q_dup_v:
8726       case NEON::BI__builtin_neon_vst1_v:
8727       case NEON::BI__builtin_neon_vst1q_v:
8728       case NEON::BI__builtin_neon_vst1q_lane_v:
8729       case NEON::BI__builtin_neon_vst1_lane_v:
8730       case NEON::BI__builtin_neon_vst2_v:
8731       case NEON::BI__builtin_neon_vst2q_v:
8732       case NEON::BI__builtin_neon_vst2_lane_v:
8733       case NEON::BI__builtin_neon_vst2q_lane_v:
8734       case NEON::BI__builtin_neon_vst3_v:
8735       case NEON::BI__builtin_neon_vst3q_v:
8736       case NEON::BI__builtin_neon_vst3_lane_v:
8737       case NEON::BI__builtin_neon_vst3q_lane_v:
8738       case NEON::BI__builtin_neon_vst4_v:
8739       case NEON::BI__builtin_neon_vst4q_v:
8740       case NEON::BI__builtin_neon_vst4_lane_v:
8741       case NEON::BI__builtin_neon_vst4q_lane_v:
8742         // Get the alignment for the argument in addition to the value;
8743         // we'll use it later.
8744         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
8745         Ops.push_back(PtrOp0.getPointer());
8746         continue;
8747       }
8748     }
8749     if (i == 1) {
8750       switch (BuiltinID) {
8751       case NEON::BI__builtin_neon_vld2_v:
8752       case NEON::BI__builtin_neon_vld2q_v:
8753       case NEON::BI__builtin_neon_vld3_v:
8754       case NEON::BI__builtin_neon_vld3q_v:
8755       case NEON::BI__builtin_neon_vld4_v:
8756       case NEON::BI__builtin_neon_vld4q_v:
8757       case NEON::BI__builtin_neon_vld2_lane_v:
8758       case NEON::BI__builtin_neon_vld2q_lane_v:
8759       case NEON::BI__builtin_neon_vld3_lane_v:
8760       case NEON::BI__builtin_neon_vld3q_lane_v:
8761       case NEON::BI__builtin_neon_vld4_lane_v:
8762       case NEON::BI__builtin_neon_vld4q_lane_v:
8763       case NEON::BI__builtin_neon_vld2_dup_v:
8764       case NEON::BI__builtin_neon_vld2q_dup_v:
8765       case NEON::BI__builtin_neon_vld3_dup_v:
8766       case NEON::BI__builtin_neon_vld3q_dup_v:
8767       case NEON::BI__builtin_neon_vld4_dup_v:
8768       case NEON::BI__builtin_neon_vld4q_dup_v:
8769         // Get the alignment for the argument in addition to the value;
8770         // we'll use it later.
8771         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
8772         Ops.push_back(PtrOp1.getPointer());
8773         continue;
8774       }
8775     }
8776 
8777     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
8778   }
8779 
8780   switch (BuiltinID) {
8781   default: break;
8782 
8783   case NEON::BI__builtin_neon_vget_lane_i8:
8784   case NEON::BI__builtin_neon_vget_lane_i16:
8785   case NEON::BI__builtin_neon_vget_lane_i32:
8786   case NEON::BI__builtin_neon_vget_lane_i64:
8787   case NEON::BI__builtin_neon_vget_lane_bf16:
8788   case NEON::BI__builtin_neon_vget_lane_f32:
8789   case NEON::BI__builtin_neon_vgetq_lane_i8:
8790   case NEON::BI__builtin_neon_vgetq_lane_i16:
8791   case NEON::BI__builtin_neon_vgetq_lane_i32:
8792   case NEON::BI__builtin_neon_vgetq_lane_i64:
8793   case NEON::BI__builtin_neon_vgetq_lane_bf16:
8794   case NEON::BI__builtin_neon_vgetq_lane_f32:
8795   case NEON::BI__builtin_neon_vduph_lane_bf16:
8796   case NEON::BI__builtin_neon_vduph_laneq_bf16:
8797     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
8798 
8799   case NEON::BI__builtin_neon_vrndns_f32: {
8800     Value *Arg = EmitScalarExpr(E->getArg(0));
8801     llvm::Type *Tys[] = {Arg->getType()};
8802     Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
    return Builder.CreateCall(F, {Arg}, "vrndn");
  }
8804 
8805   case NEON::BI__builtin_neon_vset_lane_i8:
8806   case NEON::BI__builtin_neon_vset_lane_i16:
8807   case NEON::BI__builtin_neon_vset_lane_i32:
8808   case NEON::BI__builtin_neon_vset_lane_i64:
8809   case NEON::BI__builtin_neon_vset_lane_bf16:
8810   case NEON::BI__builtin_neon_vset_lane_f32:
8811   case NEON::BI__builtin_neon_vsetq_lane_i8:
8812   case NEON::BI__builtin_neon_vsetq_lane_i16:
8813   case NEON::BI__builtin_neon_vsetq_lane_i32:
8814   case NEON::BI__builtin_neon_vsetq_lane_i64:
8815   case NEON::BI__builtin_neon_vsetq_lane_bf16:
8816   case NEON::BI__builtin_neon_vsetq_lane_f32:
8817     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
8818 
8819   case NEON::BI__builtin_neon_vsha1h_u32:
8820     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
8821                         "vsha1h");
  case NEON::BI__builtin_neon_vsha1cq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
                        "vsha1c");
  case NEON::BI__builtin_neon_vsha1pq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
                        "vsha1p");
  case NEON::BI__builtin_neon_vsha1mq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                        "vsha1m");
8831 
8832   case NEON::BI__builtin_neon_vcvth_bf16_f32: {
8833     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
8834                         "vcvtbfp2bf");
8835   }
8836 
8837   // The ARM _MoveToCoprocessor builtins put the input register value as
8838   // the first argument, but the LLVM intrinsic expects it as the third one.
8839   case clang::ARM::BI_MoveToCoprocessor:
8840   case clang::ARM::BI_MoveToCoprocessor2: {
8841     Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
8842                                        ? Intrinsic::arm_mcr
8843                                        : Intrinsic::arm_mcr2);
8844     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
8845                                   Ops[3], Ops[4], Ops[5]});
8846   }
8847   }
8848 
8849   // Get the last argument, which specifies the vector type.
8850   assert(HasExtraArg);
8851   const Expr *Arg = E->getArg(E->getNumArgs()-1);
8852   std::optional<llvm::APSInt> Result =
8853       Arg->getIntegerConstantExpr(getContext());
8854   if (!Result)
8855     return nullptr;
8856 
8857   if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
8858       BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
8859     // Determine the overloaded type of this builtin.
8860     llvm::Type *Ty;
8861     if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
8862       Ty = FloatTy;
8863     else
8864       Ty = DoubleTy;
8865 
8866     // Determine whether this is an unsigned conversion or not.
8867     bool usgn = Result->getZExtValue() == 1;
8868     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
8869 
8870     // Call the appropriate intrinsic.
8871     Function *F = CGM.getIntrinsic(Int, Ty);
8872     return Builder.CreateCall(F, Ops, "vcvtr");
8873   }
8874 
8875   // Determine the type of this overloaded NEON intrinsic.
8876   NeonTypeFlags Type = Result->getZExtValue();
8877   bool usgn = Type.isUnsigned();
8878   bool rightShift = false;
8879 
8880   llvm::FixedVectorType *VTy =
8881       GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
8882                   getTarget().hasBFloat16Type());
8883   llvm::Type *Ty = VTy;
8884   if (!Ty)
8885     return nullptr;
8886 
8887   // Many NEON builtins have identical semantics and uses in ARM and
8888   // AArch64. Emit these in a single function.
8889   auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
8890   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
8891       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
8892   if (Builtin)
8893     return EmitCommonNeonBuiltinExpr(
8894         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
8895         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
8896 
8897   unsigned Int;
8898   switch (BuiltinID) {
8899   default: return nullptr;
8900   case NEON::BI__builtin_neon_vld1q_lane_v:
8901     // Handle 64-bit integer elements as a special case.  Use shuffles of
8902     // one-element vectors to avoid poor code for i64 in the backend.
8903     if (VTy->getElementType()->isIntegerTy(64)) {
8904       // Extract the other lane.
8905       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8906       int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
8907       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
8908       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8909       // Load the value as a one-element vector.
8910       Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
8911       llvm::Type *Tys[] = {Ty, Int8PtrTy};
8912       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
8913       Value *Align = getAlignmentValue32(PtrOp0);
8914       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
8915       // Combine them.
8916       int Indices[] = {1 - Lane, Lane};
8917       return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
8918     }
8919     [[fallthrough]];
8920   case NEON::BI__builtin_neon_vld1_lane_v: {
8921     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8922     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
8923     Value *Ld = Builder.CreateLoad(PtrOp0);
8924     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
8925   }
8926   case NEON::BI__builtin_neon_vqrshrn_n_v:
8927     Int =
8928       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
8929     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
8930                         1, true);
8931   case NEON::BI__builtin_neon_vqrshrun_n_v:
8932     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
8933                         Ops, "vqrshrun_n", 1, true);
8934   case NEON::BI__builtin_neon_vqshrn_n_v:
8935     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
8936     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
8937                         1, true);
8938   case NEON::BI__builtin_neon_vqshrun_n_v:
8939     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
8940                         Ops, "vqshrun_n", 1, true);
8941   case NEON::BI__builtin_neon_vrecpe_v:
8942   case NEON::BI__builtin_neon_vrecpeq_v:
8943     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
8944                         Ops, "vrecpe");
8945   case NEON::BI__builtin_neon_vrshrn_n_v:
8946     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
8947                         Ops, "vrshrn_n", 1, true);
8948   case NEON::BI__builtin_neon_vrsra_n_v:
8949   case NEON::BI__builtin_neon_vrsraq_n_v:
8950     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8951     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8952     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
8953     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
8954     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
8955     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
8956   case NEON::BI__builtin_neon_vsri_n_v:
8957   case NEON::BI__builtin_neon_vsriq_n_v:
8958     rightShift = true;
8959     [[fallthrough]];
8960   case NEON::BI__builtin_neon_vsli_n_v:
8961   case NEON::BI__builtin_neon_vsliq_n_v:
8962     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
8963     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
8964                         Ops, "vsli_n");
8965   case NEON::BI__builtin_neon_vsra_n_v:
8966   case NEON::BI__builtin_neon_vsraq_n_v:
8967     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8968     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
8969     return Builder.CreateAdd(Ops[0], Ops[1]);
8970   case NEON::BI__builtin_neon_vst1q_lane_v:
8971     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
8972     // a one-element vector and avoid poor code for i64 in the backend.
8973     if (VTy->getElementType()->isIntegerTy(64)) {
8974       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8975       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
8976       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8977       Ops[2] = getAlignmentValue32(PtrOp0);
8978       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
8979       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
8980                                                  Tys), Ops);
8981     }
8982     [[fallthrough]];
8983   case NEON::BI__builtin_neon_vst1_lane_v: {
8984     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8985     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
8986     return Builder.CreateStore(Ops[1],
8987                                PtrOp0.withElementType(Ops[1]->getType()));
8988   }
8989   case NEON::BI__builtin_neon_vtbl1_v:
8990     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
8991                         Ops, "vtbl1");
8992   case NEON::BI__builtin_neon_vtbl2_v:
8993     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
8994                         Ops, "vtbl2");
8995   case NEON::BI__builtin_neon_vtbl3_v:
8996     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
8997                         Ops, "vtbl3");
8998   case NEON::BI__builtin_neon_vtbl4_v:
8999     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
9000                         Ops, "vtbl4");
9001   case NEON::BI__builtin_neon_vtbx1_v:
9002     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
9003                         Ops, "vtbx1");
9004   case NEON::BI__builtin_neon_vtbx2_v:
9005     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
9006                         Ops, "vtbx2");
9007   case NEON::BI__builtin_neon_vtbx3_v:
9008     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
9009                         Ops, "vtbx3");
9010   case NEON::BI__builtin_neon_vtbx4_v:
9011     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
9012                         Ops, "vtbx4");
9013   }
9014 }
9015 
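// Evaluate E as an integer constant expression and return its value converted
// to the requested integer type. The argument is expected to have already been
// checked to be an integer constant expression.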
9016 template<typename Integer>
9017 static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
9018   return E->getIntegerConstantExpr(Context)->getExtValue();
9019 }
9020 
9021 static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
9022                                      llvm::Type *T, bool Unsigned) {
9023   // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
9024   // which finds it convenient to specify signed/unsigned as a boolean flag.
9025   return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
9026 }
9027 
9028 static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
9029                                     uint32_t Shift, bool Unsigned) {
9030   // MVE helper function for integer shift right. This must handle signed vs
9031   // unsigned, and also deal specially with the case where the shift count is
9032   // equal to the lane size. In LLVM IR, an LShr with that parameter would be
9033   // undefined behavior, but in MVE it's legal, so we must convert it to code
9034   // that is not undefined in IR.
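  // For example, shifting each 16-bit lane of a vector right by 16 is emitted
  // as a zero vector when the shift is unsigned, and as an arithmetic shift by
  // 15 when it is signed.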
9035   unsigned LaneBits = cast<llvm::VectorType>(V->getType())
9036                           ->getElementType()
9037                           ->getPrimitiveSizeInBits();
9038   if (Shift == LaneBits) {
9039     // An unsigned shift of the full lane size always generates zero, so we can
9040     // simply emit a zero vector. A signed shift of the full lane size does the
9041     // same thing as shifting by one bit fewer.
9042     if (Unsigned)
9043       return llvm::Constant::getNullValue(V->getType());
9044     else
9045       --Shift;
9046   }
9047   return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
9048 }
9049 
9050 static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
9051   // MVE-specific helper function for a vector splat, which infers the element
9052   // count of the output vector by knowing that MVE vectors are all 128 bits
9053   // wide.
9054   unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
9055   return Builder.CreateVectorSplat(Elements, V);
9056 }
9057 
9058 static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
9059                                             CodeGenFunction *CGF,
9060                                             llvm::Value *V,
9061                                             llvm::Type *DestType) {
9062   // Convert one MVE vector type into another by reinterpreting its in-register
9063   // format.
9064   //
  // In little-endian mode, this is identical to a bitcast (which reinterprets
  // the memory format). But in big-endian mode, the two are not necessarily
  // the same, because the register and memory formats map to each other
  // differently depending on the lane size.
9069   //
9070   // We generate a bitcast whenever we can (if we're little-endian, or if the
9071   // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
9072   // that performs the different kind of reinterpretation.
9073   if (CGF->getTarget().isBigEndian() &&
9074       V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
9075     return Builder.CreateCall(
9076         CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
9077                               {DestType, V->getType()}),
9078         V);
9079   } else {
9080     return Builder.CreateBitCast(V, DestType);
9081   }
9082 }
9083 
9084 static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
9085   // Make a shufflevector that extracts every other element of a vector (evens
9086   // or odds, as desired).
9087   SmallVector<int, 16> Indices;
9088   unsigned InputElements =
9089       cast<llvm::FixedVectorType>(V->getType())->getNumElements();
9090   for (unsigned i = 0; i < InputElements; i += 2)
9091     Indices.push_back(i + Odd);
9092   return Builder.CreateShuffleVector(V, Indices);
9093 }
9094 
9095 static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
9096                               llvm::Value *V1) {
9097   // Make a shufflevector that interleaves two vectors element by element.
9098   assert(V0->getType() == V1->getType() && "Can't zip different vector types");
9099   SmallVector<int, 16> Indices;
9100   unsigned InputElements =
9101       cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
9102   for (unsigned i = 0; i < InputElements; i++) {
9103     Indices.push_back(i);
9104     Indices.push_back(i + InputElements);
9105   }
9106   return Builder.CreateShuffleVector(V0, V1, Indices);
9107 }
9108 
9109 template<unsigned HighBit, unsigned OtherBits>
9110 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
9111   // MVE-specific helper function to make a vector splat of a constant such as
9112   // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
9113   llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
9114   unsigned LaneBits = T->getPrimitiveSizeInBits();
9115   uint32_t Value = HighBit << (LaneBits - 1);
9116   if (OtherBits)
9117     Value |= (1UL << (LaneBits - 1)) - 1;
9118   llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
9119   return ARMMVEVectorSplat(Builder, Lane);
9120 }
9121 
9122 static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
9123                                                llvm::Value *V,
9124                                                unsigned ReverseWidth) {
9125   // MVE-specific helper function which reverses the elements of a
9126   // vector within every (ReverseWidth)-bit collection of lanes.
9127   SmallVector<int, 16> Indices;
9128   unsigned LaneSize = V->getType()->getScalarSizeInBits();
9129   unsigned Elements = 128 / LaneSize;
9130   unsigned Mask = ReverseWidth / LaneSize - 1;
9131   for (unsigned i = 0; i < Elements; i++)
9132     Indices.push_back(i ^ Mask);
9133   return Builder.CreateShuffleVector(V, Indices);
9134 }
9135 
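// Emit IR for an ARM MVE builtin call. Most MVE builtins are handled by the
// Tablegen-generated code included below; the vld2q/vld4q and vst2q/vst4q
// families need the handwritten codegen that follows the switch.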
9136 Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
9137                                               const CallExpr *E,
9138                                               ReturnValueSlot ReturnValue,
9139                                               llvm::Triple::ArchType Arch) {
9140   enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
9141   Intrinsic::ID IRIntr;
9142   unsigned NumVectors;
9143 
9144   // Code autogenerated by Tablegen will handle all the simple builtins.
9145   switch (BuiltinID) {
9146     #include "clang/Basic/arm_mve_builtin_cg.inc"
9147 
9148     // If we didn't match an MVE builtin id at all, go back to the
9149     // main EmitARMBuiltinExpr.
9150   default:
9151     return nullptr;
9152   }
9153 
9154   // Anything that breaks from that switch is an MVE builtin that
9155   // needs handwritten code to generate.
9156 
9157   switch (CustomCodeGenType) {
9158 
9159   case CustomCodeGen::VLD24: {
9160     llvm::SmallVector<Value *, 4> Ops;
9161     llvm::SmallVector<llvm::Type *, 4> Tys;
9162 
9163     auto MvecCType = E->getType();
9164     auto MvecLType = ConvertType(MvecCType);
9165     assert(MvecLType->isStructTy() &&
9166            "Return type for vld[24]q should be a struct");
9167     assert(MvecLType->getStructNumElements() == 1 &&
9168            "Return-type struct for vld[24]q should have one element");
9169     auto MvecLTypeInner = MvecLType->getStructElementType(0);
9170     assert(MvecLTypeInner->isArrayTy() &&
9171            "Return-type struct for vld[24]q should contain an array");
9172     assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
9173            "Array member of return-type struct vld[24]q has wrong length");
9174     auto VecLType = MvecLTypeInner->getArrayElementType();
9175 
9176     Tys.push_back(VecLType);
9177 
9178     auto Addr = E->getArg(0);
9179     Ops.push_back(EmitScalarExpr(Addr));
9180     Tys.push_back(ConvertType(Addr->getType()));
9181 
9182     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
9183     Value *LoadResult = Builder.CreateCall(F, Ops);
9184     Value *MvecOut = PoisonValue::get(MvecLType);
9185     for (unsigned i = 0; i < NumVectors; ++i) {
9186       Value *Vec = Builder.CreateExtractValue(LoadResult, i);
9187       MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
9188     }
9189 
9190     if (ReturnValue.isNull())
9191       return MvecOut;
9192     else
9193       return Builder.CreateStore(MvecOut, ReturnValue.getValue());
9194   }
9195 
9196   case CustomCodeGen::VST24: {
9197     llvm::SmallVector<Value *, 4> Ops;
9198     llvm::SmallVector<llvm::Type *, 4> Tys;
9199 
9200     auto Addr = E->getArg(0);
9201     Ops.push_back(EmitScalarExpr(Addr));
9202     Tys.push_back(ConvertType(Addr->getType()));
9203 
9204     auto MvecCType = E->getArg(1)->getType();
9205     auto MvecLType = ConvertType(MvecCType);
    assert(MvecLType->isStructTy() &&
           "Data type for vst[24]q should be a struct");
    assert(MvecLType->getStructNumElements() == 1 &&
           "Data-type struct for vst[24]q should have one element");
    auto MvecLTypeInner = MvecLType->getStructElementType(0);
    assert(MvecLTypeInner->isArrayTy() &&
           "Data-type struct for vst[24]q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of data-type struct for vst[24]q has wrong length");
9214     auto VecLType = MvecLTypeInner->getArrayElementType();
9215 
9216     Tys.push_back(VecLType);
9217 
9218     AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
9219     EmitAggExpr(E->getArg(1), MvecSlot);
9220     auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
9221     for (unsigned i = 0; i < NumVectors; i++)
9222       Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
9223 
9224     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
9225     Value *ToReturn = nullptr;
9226     for (unsigned i = 0; i < NumVectors; i++) {
9227       Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
9228       ToReturn = Builder.CreateCall(F, Ops);
9229       Ops.pop_back();
9230     }
9231     return ToReturn;
9232   }
9233   }
9234   llvm_unreachable("unknown custom codegen type.");
9235 }
9236 
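// Emit IR for an ARM CDE (Custom Datapath Extension) builtin call. All CDE
// builtins are handled by the Tablegen-generated code included below.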
9237 Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
9238                                               const CallExpr *E,
9239                                               ReturnValueSlot ReturnValue,
9240                                               llvm::Triple::ArchType Arch) {
9241   switch (BuiltinID) {
9242   default:
9243     return nullptr;
9244 #include "clang/Basic/arm_cde_builtin_cg.inc"
9245   }
9246 }
9247 
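// Emit IR for the AArch64 NEON table lookup builtins (vtbl/vtbx and
// vqtbl/vqtbx). The D-register vtbl/vtbx forms are packed into Q-register
// tables via packTBLDVectorList; the vqtbl/vqtbx forms map directly onto the
// aarch64.neon.tbl/tbx intrinsics.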
9248 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
9249                                       const CallExpr *E,
9250                                       SmallVectorImpl<Value *> &Ops,
9251                                       llvm::Triple::ArchType Arch) {
9252   unsigned int Int = 0;
9253   const char *s = nullptr;
9254 
9255   switch (BuiltinID) {
9256   default:
9257     return nullptr;
9258   case NEON::BI__builtin_neon_vtbl1_v:
9259   case NEON::BI__builtin_neon_vqtbl1_v:
9260   case NEON::BI__builtin_neon_vqtbl1q_v:
9261   case NEON::BI__builtin_neon_vtbl2_v:
9262   case NEON::BI__builtin_neon_vqtbl2_v:
9263   case NEON::BI__builtin_neon_vqtbl2q_v:
9264   case NEON::BI__builtin_neon_vtbl3_v:
9265   case NEON::BI__builtin_neon_vqtbl3_v:
9266   case NEON::BI__builtin_neon_vqtbl3q_v:
9267   case NEON::BI__builtin_neon_vtbl4_v:
9268   case NEON::BI__builtin_neon_vqtbl4_v:
9269   case NEON::BI__builtin_neon_vqtbl4q_v:
9270     break;
9271   case NEON::BI__builtin_neon_vtbx1_v:
9272   case NEON::BI__builtin_neon_vqtbx1_v:
9273   case NEON::BI__builtin_neon_vqtbx1q_v:
9274   case NEON::BI__builtin_neon_vtbx2_v:
9275   case NEON::BI__builtin_neon_vqtbx2_v:
9276   case NEON::BI__builtin_neon_vqtbx2q_v:
9277   case NEON::BI__builtin_neon_vtbx3_v:
9278   case NEON::BI__builtin_neon_vqtbx3_v:
9279   case NEON::BI__builtin_neon_vqtbx3q_v:
9280   case NEON::BI__builtin_neon_vtbx4_v:
9281   case NEON::BI__builtin_neon_vqtbx4_v:
9282   case NEON::BI__builtin_neon_vqtbx4q_v:
9283     break;
9284   }
9285 
9286   assert(E->getNumArgs() >= 3);
9287 
9288   // Get the last argument, which specifies the vector type.
9289   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
9290   std::optional<llvm::APSInt> Result =
9291       Arg->getIntegerConstantExpr(CGF.getContext());
9292   if (!Result)
9293     return nullptr;
9294 
9295   // Determine the type of this overloaded NEON intrinsic.
9296   NeonTypeFlags Type = Result->getZExtValue();
9297   llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
9298   if (!Ty)
9299     return nullptr;
9300 
9301   CodeGen::CGBuilderTy &Builder = CGF.Builder;
9302 
  // AArch64 scalar builtins are not overloaded; they do not have an extra
  // argument that specifies the vector type, so we need to handle each case.
9305   switch (BuiltinID) {
9306   case NEON::BI__builtin_neon_vtbl1_v: {
9307     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
9308                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
9309   }
9310   case NEON::BI__builtin_neon_vtbl2_v: {
9311     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
9312                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
9313   }
9314   case NEON::BI__builtin_neon_vtbl3_v: {
9315     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
9316                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
9317   }
9318   case NEON::BI__builtin_neon_vtbl4_v: {
9319     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
9320                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
9321   }
9322   case NEON::BI__builtin_neon_vtbx1_v: {
9323     Value *TblRes =
9324         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
9325                            Intrinsic::aarch64_neon_tbl1, "vtbl1");
9326 
9327     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
9328     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
9329     CmpRes = Builder.CreateSExt(CmpRes, Ty);
9330 
9331     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
9332     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
9333     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
9334   }
9335   case NEON::BI__builtin_neon_vtbx2_v: {
9336     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
9337                               Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
9338   }
9339   case NEON::BI__builtin_neon_vtbx3_v: {
9340     Value *TblRes =
9341         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
9342                            Intrinsic::aarch64_neon_tbl2, "vtbl2");
9343 
9344     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
9345     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
9346                                            TwentyFourV);
9347     CmpRes = Builder.CreateSExt(CmpRes, Ty);
9348 
9349     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
9350     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
9351     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
9352   }
9353   case NEON::BI__builtin_neon_vtbx4_v: {
9354     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
9355                               Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
9356   }
9357   case NEON::BI__builtin_neon_vqtbl1_v:
9358   case NEON::BI__builtin_neon_vqtbl1q_v:
9359     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
9360   case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
    Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
9363   case NEON::BI__builtin_neon_vqtbl3_v:
9364   case NEON::BI__builtin_neon_vqtbl3q_v:
9365     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
9366   case NEON::BI__builtin_neon_vqtbl4_v:
9367   case NEON::BI__builtin_neon_vqtbl4q_v:
9368     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
9369   case NEON::BI__builtin_neon_vqtbx1_v:
9370   case NEON::BI__builtin_neon_vqtbx1q_v:
9371     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
9372   case NEON::BI__builtin_neon_vqtbx2_v:
9373   case NEON::BI__builtin_neon_vqtbx2q_v:
9374     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
9375   case NEON::BI__builtin_neon_vqtbx3_v:
9376   case NEON::BI__builtin_neon_vqtbx3q_v:
9377     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
9378   case NEON::BI__builtin_neon_vqtbx4_v:
9379   case NEON::BI__builtin_neon_vqtbx4q_v:
9380     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
  }
9383 
9384   if (!Int)
9385     return nullptr;
9386 
9387   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
9388   return CGF.EmitNeonCall(F, Ops, s);
9389 }
9390 
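// Wrap a scalar 16-bit value in lane 0 of a <4 x i16> vector, leaving the
// remaining lanes as poison.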
9391 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
9392   auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
9393   Op = Builder.CreateBitCast(Op, Int16Ty);
9394   Value *V = PoisonValue::get(VTy);
9395   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
9396   Op = Builder.CreateInsertElement(V, Op, CI);
9397   return Op;
9398 }
9399 
9400 /// SVEBuiltinMemEltTy - Returns the memory element type for this memory
9401 /// access builtin.  Only required if it can't be inferred from the base pointer
9402 /// operand.
9403 llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
9404   switch (TypeFlags.getMemEltType()) {
9405   case SVETypeFlags::MemEltTyDefault:
9406     return getEltType(TypeFlags);
9407   case SVETypeFlags::MemEltTyInt8:
9408     return Builder.getInt8Ty();
9409   case SVETypeFlags::MemEltTyInt16:
9410     return Builder.getInt16Ty();
9411   case SVETypeFlags::MemEltTyInt32:
9412     return Builder.getInt32Ty();
9413   case SVETypeFlags::MemEltTyInt64:
9414     return Builder.getInt64Ty();
9415   }
9416   llvm_unreachable("Unknown MemEltType");
9417 }
9418 
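// Return the LLVM scalar type for the element kind described by TypeFlags;
// all predicate (bool) element kinds map to i1.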
9419 llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
9420   switch (TypeFlags.getEltType()) {
9421   default:
9422     llvm_unreachable("Invalid SVETypeFlag!");
9423 
9424   case SVETypeFlags::EltTyInt8:
9425     return Builder.getInt8Ty();
9426   case SVETypeFlags::EltTyInt16:
9427     return Builder.getInt16Ty();
9428   case SVETypeFlags::EltTyInt32:
9429     return Builder.getInt32Ty();
9430   case SVETypeFlags::EltTyInt64:
9431     return Builder.getInt64Ty();
9432   case SVETypeFlags::EltTyInt128:
9433     return Builder.getInt128Ty();
9434 
9435   case SVETypeFlags::EltTyFloat16:
9436     return Builder.getHalfTy();
9437   case SVETypeFlags::EltTyFloat32:
9438     return Builder.getFloatTy();
9439   case SVETypeFlags::EltTyFloat64:
9440     return Builder.getDoubleTy();
9441 
9442   case SVETypeFlags::EltTyBFloat16:
9443     return Builder.getBFloatTy();
9444 
9445   case SVETypeFlags::EltTyBool8:
9446   case SVETypeFlags::EltTyBool16:
9447   case SVETypeFlags::EltTyBool32:
9448   case SVETypeFlags::EltTyBool64:
9449     return Builder.getInt1Ty();
9450   }
9451 }
9452 
// Return the llvm predicate vector type corresponding to the element type
// specified by TypeFlags.
9455 llvm::ScalableVectorType *
9456 CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
9457   switch (TypeFlags.getEltType()) {
9458   default: llvm_unreachable("Unhandled SVETypeFlag!");
9459 
9460   case SVETypeFlags::EltTyInt8:
9461     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9462   case SVETypeFlags::EltTyInt16:
9463     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9464   case SVETypeFlags::EltTyInt32:
9465     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9466   case SVETypeFlags::EltTyInt64:
9467     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9468 
9469   case SVETypeFlags::EltTyBFloat16:
9470     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9471   case SVETypeFlags::EltTyFloat16:
9472     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9473   case SVETypeFlags::EltTyFloat32:
9474     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9475   case SVETypeFlags::EltTyFloat64:
9476     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9477 
9478   case SVETypeFlags::EltTyBool8:
9479     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9480   case SVETypeFlags::EltTyBool16:
9481     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9482   case SVETypeFlags::EltTyBool32:
9483     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9484   case SVETypeFlags::EltTyBool64:
9485     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9486   }
9487 }
9488 
9489 // Return the llvm vector type corresponding to the element type in TypeFlags.
9490 llvm::ScalableVectorType *
9491 CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
9492   switch (TypeFlags.getEltType()) {
9493   default:
9494     llvm_unreachable("Invalid SVETypeFlag!");
9495 
9496   case SVETypeFlags::EltTyInt8:
9497     return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
9498   case SVETypeFlags::EltTyInt16:
9499     return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
9500   case SVETypeFlags::EltTyInt32:
9501     return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
9502   case SVETypeFlags::EltTyInt64:
9503     return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
9504 
9505   case SVETypeFlags::EltTyFloat16:
9506     return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
9507   case SVETypeFlags::EltTyBFloat16:
9508     return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
9509   case SVETypeFlags::EltTyFloat32:
9510     return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
9511   case SVETypeFlags::EltTyFloat64:
9512     return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);
9513 
9514   case SVETypeFlags::EltTyBool8:
9515     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9516   case SVETypeFlags::EltTyBool16:
9517     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9518   case SVETypeFlags::EltTyBool32:
9519     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9520   case SVETypeFlags::EltTyBool64:
9521     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9522   }
9523 }
9524 
9525 llvm::Value *
9526 CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
9527   Function *Ptrue =
9528       CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
9529   return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
9530 }
9531 
9532 constexpr unsigned SVEBitsPerBlock = 128;
9533 
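// Return the scalable vector type whose known-minimum size is one 128-bit SVE
// block for the given element type, e.g. i32 -> <vscale x 4 x i32>.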
9534 static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
9535   unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
9536   return llvm::ScalableVectorType::get(EltTy, NumElts);
9537 }
9538 
9539 // Reinterpret the input predicate so that it can be used to correctly isolate
9540 // the elements of the specified datatype.
9541 Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
9542                                              llvm::ScalableVectorType *VTy) {
9543 
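  // Predicates of the aarch64.svcount target extension type are passed
  // through unchanged.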
9544   if (isa<TargetExtType>(Pred->getType()) &&
9545       cast<TargetExtType>(Pred->getType())->getName() == "aarch64.svcount")
9546     return Pred;
9547 
9548   auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
9549   if (Pred->getType() == RTy)
9550     return Pred;
9551 
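  // svbool_t is represented as <vscale x 16 x i1>. Converting to a predicate
  // with fewer lanes uses convert_from_svbool; converting a narrower predicate
  // back to 16 lanes uses convert_to_svbool.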
9552   unsigned IntID;
9553   llvm::Type *IntrinsicTy;
9554   switch (VTy->getMinNumElements()) {
9555   default:
9556     llvm_unreachable("unsupported element count!");
9557   case 1:
9558   case 2:
9559   case 4:
9560   case 8:
9561     IntID = Intrinsic::aarch64_sve_convert_from_svbool;
9562     IntrinsicTy = RTy;
9563     break;
9564   case 16:
9565     IntID = Intrinsic::aarch64_sve_convert_to_svbool;
9566     IntrinsicTy = Pred->getType();
9567     break;
9568   }
9569 
9570   Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
9571   Value *C = Builder.CreateCall(F, Pred);
9572   assert(C->getType() == RTy && "Unexpected return type!");
9573   return C;
9574 }
9575 
9576 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
9577                                           SmallVectorImpl<Value *> &Ops,
9578                                           unsigned IntID) {
9579   auto *ResultTy = getSVEType(TypeFlags);
9580   auto *OverloadedTy =
9581       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
9582 
9583   Function *F = nullptr;
9584   if (Ops[1]->getType()->isVectorTy())
9585     // This is the "vector base, scalar offset" case. In order to uniquely
9586     // map this built-in to an LLVM IR intrinsic, we need both the return type
9587     // and the type of the vector base.
9588     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
9589   else
9590     // This is the "scalar base, vector offset" case. The type of the offset
9591     // is encoded in the name of the intrinsic. We only need to specify the
9592     // return type in order to uniquely map this built-in to an LLVM IR
9593     // intrinsic.
9594     F = CGM.getIntrinsic(IntID, OverloadedTy);
9595 
9596   // At the ACLE level there's only one predicate type, svbool_t, which is
9597   // mapped to <n x 16 x i1>. However, this might be incompatible with the
9598   // actual type being loaded. For example, when loading doubles (i64) the
9599   // predicate should be <n x 2 x i1> instead. At the IR level the type of
9600   // the predicate and the data being loaded must match. Cast to the type
9601   // expected by the intrinsic. The intrinsic itself should be defined in
9602   // a way that enforces relations between parameter types.
9603   Ops[0] = EmitSVEPredicateCast(
9604       Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
9605 
9606   // Pass 0 when the offset is missing. This can only be applied when using
9607   // the "vector base" addressing mode for which ACLE allows no offset. The
9608   // corresponding LLVM IR always requires an offset.
9609   if (Ops.size() == 2) {
9610     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
9611     Ops.push_back(ConstantInt::get(Int64Ty, 0));
9612   }
9613 
9614   // For "vector base, scalar index" scale the index so that it becomes a
9615   // scalar offset.
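  // For example, with 64-bit elements an index of 1 becomes a byte offset of
  // 8 (the index is shifted left by log2(8) = 3).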
9616   if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
9617     unsigned BytesPerElt =
9618         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9619     Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9620   }
9621 
9622   Value *Call = Builder.CreateCall(F, Ops);
9623 
9624   // The following sext/zext is only needed when ResultTy != OverloadedTy. In
9625   // other cases it's folded into a nop.
9626   return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
9627                                   : Builder.CreateSExt(Call, ResultTy);
9628 }
9629 
9630 Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
9631                                             SmallVectorImpl<Value *> &Ops,
9632                                             unsigned IntID) {
9633   auto *SrcDataTy = getSVEType(TypeFlags);
9634   auto *OverloadedTy =
9635       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
9636 
9637   // In ACLE the source data is passed in the last argument, whereas in LLVM IR
9638   // it's the first argument. Move it accordingly.
9639   Ops.insert(Ops.begin(), Ops.pop_back_val());
9640 
9641   Function *F = nullptr;
9642   if (Ops[2]->getType()->isVectorTy())
9643     // This is the "vector base, scalar offset" case. In order to uniquely
9644     // map this built-in to an LLVM IR intrinsic, we need both the return type
9645     // and the type of the vector base.
9646     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
9647   else
9648     // This is the "scalar base, vector offset" case. The type of the offset
9649     // is encoded in the name of the intrinsic. We only need to specify the
9650     // return type in order to uniquely map this built-in to an LLVM IR
9651     // intrinsic.
9652     F = CGM.getIntrinsic(IntID, OverloadedTy);
9653 
9654   // Pass 0 when the offset is missing. This can only be applied when using
9655   // the "vector base" addressing mode for which ACLE allows no offset. The
9656   // corresponding LLVM IR always requires an offset.
9657   if (Ops.size() == 3) {
9658     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
9659     Ops.push_back(ConstantInt::get(Int64Ty, 0));
9660   }
9661 
9662   // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
9663   // folded into a nop.
9664   Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
9665 
9666   // At the ACLE level there's only one predicate type, svbool_t, which is
9667   // mapped to <n x 16 x i1>. However, this might be incompatible with the
9668   // actual type being stored. For example, when storing doubles (i64) the
9669   // predicate should be <n x 2 x i1> instead. At the IR level the type of
9670   // the predicate and the data being stored must match. Cast to the type
9671   // expected by the intrinsic. The intrinsic itself should be defined in
9672   // a way that enforces relations between parameter types.
9673   Ops[1] = EmitSVEPredicateCast(
9674       Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
9675 
9676   // For "vector base, scalar index" scale the index so that it becomes a
9677   // scalar offset.
9678   if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
9679     unsigned BytesPerElt =
9680         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9681     Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt));
9682   }
9683 
9684   return Builder.CreateCall(F, Ops);
9685 }
9686 
9687 Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
9688                                               SmallVectorImpl<Value *> &Ops,
9689                                               unsigned IntID) {
9690   // The gather prefetches are overloaded on the vector input - this can either
9691   // be the vector of base addresses or vector of offsets.
9692   auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
9693   if (!OverloadedTy)
9694     OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());
9695 
9696   // Cast the predicate from svbool_t to the right number of elements.
9697   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
9698 
9699   // vector + imm addressing modes
9700   if (Ops[1]->getType()->isVectorTy()) {
9701     if (Ops.size() == 3) {
9702       // Pass 0 for 'vector+imm' when the index is omitted.
9703       Ops.push_back(ConstantInt::get(Int64Ty, 0));
9704 
9705       // The sv_prfop is the last operand in the builtin and IR intrinsic.
9706       std::swap(Ops[2], Ops[3]);
9707     } else {
9708       // Index needs to be passed as scaled offset.
9709       llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9710       unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
9711       if (BytesPerElt > 1)
9712         Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9713     }
9714   }
9715 
9716   Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
9717   return Builder.CreateCall(F, Ops);
9718 }
9719 
9720 Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
9721                                           SmallVectorImpl<Value*> &Ops,
9722                                           unsigned IntID) {
9723   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9724 
9725   unsigned N;
9726   switch (IntID) {
9727   case Intrinsic::aarch64_sve_ld2_sret:
9728   case Intrinsic::aarch64_sve_ld1_pn_x2:
9729   case Intrinsic::aarch64_sve_ldnt1_pn_x2:
9730   case Intrinsic::aarch64_sve_ld2q_sret:
9731     N = 2;
9732     break;
9733   case Intrinsic::aarch64_sve_ld3_sret:
9734   case Intrinsic::aarch64_sve_ld3q_sret:
9735     N = 3;
9736     break;
9737   case Intrinsic::aarch64_sve_ld4_sret:
9738   case Intrinsic::aarch64_sve_ld1_pn_x4:
9739   case Intrinsic::aarch64_sve_ldnt1_pn_x4:
9740   case Intrinsic::aarch64_sve_ld4q_sret:
9741     N = 4;
9742     break;
9743   default:
9744     llvm_unreachable("unknown intrinsic!");
9745   }
9746   auto RetTy = llvm::VectorType::get(VTy->getElementType(),
9747                                      VTy->getElementCount() * N);
9748 
9749   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
9750   Value *BasePtr = Ops[1];
9751 
9752   // Does the load have an offset?
9753   if (Ops.size() > 2)
9754     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9755 
9756   Function *F = CGM.getIntrinsic(IntID, {VTy});
9757   Value *Call = Builder.CreateCall(F, {Predicate, BasePtr});
9758   unsigned MinElts = VTy->getMinNumElements();
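  // Concatenate the N part vectors returned by the intrinsic into a single
  // wide result vector, inserting part I at element offset I * MinElts.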
9759   Value *Ret = llvm::PoisonValue::get(RetTy);
9760   for (unsigned I = 0; I < N; I++) {
9761     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
9762     Value *SRet = Builder.CreateExtractValue(Call, I);
9763     Ret = Builder.CreateInsertVector(RetTy, Ret, SRet, Idx);
9764   }
9765   return Ret;
9766 }
9767 
9768 Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
9769                                            SmallVectorImpl<Value*> &Ops,
9770                                            unsigned IntID) {
9771   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9772 
9773   unsigned N;
9774   switch (IntID) {
9775   case Intrinsic::aarch64_sve_st2:
9776   case Intrinsic::aarch64_sve_st1_pn_x2:
9777   case Intrinsic::aarch64_sve_stnt1_pn_x2:
9778   case Intrinsic::aarch64_sve_st2q:
9779     N = 2;
9780     break;
9781   case Intrinsic::aarch64_sve_st3:
9782   case Intrinsic::aarch64_sve_st3q:
9783     N = 3;
9784     break;
9785   case Intrinsic::aarch64_sve_st4:
9786   case Intrinsic::aarch64_sve_st1_pn_x4:
9787   case Intrinsic::aarch64_sve_stnt1_pn_x4:
9788   case Intrinsic::aarch64_sve_st4q:
9789     N = 4;
9790     break;
9791   default:
9792     llvm_unreachable("unknown intrinsic!");
9793   }
9794 
9795   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
9796   Value *BasePtr = Ops[1];
9797 
9798   // Does the store have an offset?
9799   if (Ops.size() > (2 + N))
9800     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9801 
9802   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
9803   // need to break up the tuple vector.
9804   SmallVector<llvm::Value*, 5> Operands;
9805   for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
9806     Operands.push_back(Ops[I]);
9807   Operands.append({Predicate, BasePtr});
9808   Function *F = CGM.getIntrinsic(IntID, { VTy });
9809 
9810   return Builder.CreateCall(F, Operands);
9811 }
9812 
9813 // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
9814 // svpmullt_pair intrinsics, with the exception that their results are bitcast
9815 // to a wider type.
9816 Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
9817                                      SmallVectorImpl<Value *> &Ops,
9818                                      unsigned BuiltinID) {
9819   // Splat scalar operand to vector (intrinsics with _n infix)
9820   if (TypeFlags.hasSplatOperand()) {
9821     unsigned OpNo = TypeFlags.getSplatOperand();
9822     Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
9823   }
9824 
9825   // The pair-wise function has a narrower overloaded type.
9826   Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
9827   Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
9828 
9829   // Now bitcast to the wider result type.
9830   llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
9831   return EmitSVEReinterpret(Call, Ty);
9832 }
9833 
9834 Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
9835                                     ArrayRef<Value *> Ops, unsigned BuiltinID) {
9836   llvm::Type *OverloadedTy = getSVEType(TypeFlags);
9837   Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
9838   return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
9839 }
9840 
9841 Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
9842                                             SmallVectorImpl<Value *> &Ops,
9843                                             unsigned BuiltinID) {
9844   auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9845   auto *VectorTy = getSVEVectorForElementType(MemEltTy);
9846   auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9847 
9848   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9849   Value *BasePtr = Ops[1];
9850 
9851   // Add the index operand to the base pointer if it was not omitted.
9852   if (Ops.size() > 3)
9853     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9854 
9855   Value *PrfOp = Ops.back();
9856 
9857   Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
9858   return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
9859 }
9860 
9861 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
9862                                           llvm::Type *ReturnTy,
9863                                           SmallVectorImpl<Value *> &Ops,
9864                                           unsigned IntrinsicID,
9865                                           bool IsZExtReturn) {
9866   QualType LangPTy = E->getArg(1)->getType();
9867   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9868       LangPTy->castAs<PointerType>()->getPointeeType());
9869 
9870   // The vector type that is returned may be different from the
9871   // eventual type loaded from memory.
9872   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
9873   llvm::ScalableVectorType *MemoryTy = nullptr;
9874   llvm::ScalableVectorType *PredTy = nullptr;
9875   bool IsQuadLoad = false;
9876   switch (IntrinsicID) {
9877   case Intrinsic::aarch64_sve_ld1uwq:
9878   case Intrinsic::aarch64_sve_ld1udq:
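    // These quadword loads use a one-element memory type and a
    // <vscale x 1 x i1> predicate rather than one predicate lane per element.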
9879     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
9880     PredTy = llvm::ScalableVectorType::get(
9881         llvm::Type::getInt1Ty(getLLVMContext()), 1);
9882     IsQuadLoad = true;
9883     break;
9884   default:
9885     MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9886     PredTy = MemoryTy;
9887     break;
9888   }
9889 
9890   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
9891   Value *BasePtr = Ops[1];
9892 
9893   // Does the load have an offset?
9894   if (Ops.size() > 2)
9895     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9896 
9897   Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
9898   auto *Load =
9899       cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
9900   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9901   CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
9902 
9903   if (IsQuadLoad)
9904     return Load;
9905 
9906   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
9907                       : Builder.CreateSExt(Load, VectorTy);
9908 }
9909 
9910 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
9911                                            SmallVectorImpl<Value *> &Ops,
9912                                            unsigned IntrinsicID) {
9913   QualType LangPTy = E->getArg(1)->getType();
9914   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9915       LangPTy->castAs<PointerType>()->getPointeeType());
9916 
9917   // The vector type that is stored may be different from the
9918   // eventual type stored to memory.
9919   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
9920   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9921 
9922   auto PredTy = MemoryTy;
9923   auto AddrMemoryTy = MemoryTy;
9924   bool IsQuadStore = false;
9925 
9926   switch (IntrinsicID) {
9927   case Intrinsic::aarch64_sve_st1wq:
9928   case Intrinsic::aarch64_sve_st1dq:
9929     AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
9930     PredTy =
9931         llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
9932     IsQuadStore = true;
9933     break;
9934   default:
9935     break;
9936   }
9937   Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
9938   Value *BasePtr = Ops[1];
9939 
9940   // Does the store have an offset?
9941   if (Ops.size() == 4)
9942     BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
9943 
9944   // Last value is always the data
9945   Value *Val =
9946       IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);
9947 
9948   Function *F =
9949       CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
9950   auto *Store =
9951       cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
9952   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9953   CGM.DecorateInstructionWithTBAA(Store, TBAAInfo);
9954   return Store;
9955 }
9956 
9957 Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
9958                                       SmallVectorImpl<Value *> &Ops,
9959                                       unsigned IntID) {
9960   Ops[2] = EmitSVEPredicateCast(
9961       Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
9962 
9963   SmallVector<Value *> NewOps;
9964   NewOps.push_back(Ops[2]);
9965 
9966   llvm::Value *BasePtr = Ops[3];
9967 
9968   // If the intrinsic contains the vnum parameter, multiply it by the vector
9969   // size in bytes.
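  // i.e. BasePtr += vnum * svcntsb(), where svcntsb() is the number of bytes
  // in one streaming vector.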
9970   if (Ops.size() == 5) {
9971     Function *StreamingVectorLength =
9972         CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
9973     llvm::Value *StreamingVectorLengthCall =
9974         Builder.CreateCall(StreamingVectorLength);
9975     llvm::Value *Mulvl =
9976         Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl");
9977     // The type of the ptr parameter is void *, so use Int8Ty here.
9978     BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl);
9979   }
9980   NewOps.push_back(BasePtr);
9981   NewOps.push_back(Ops[0]);
9982   NewOps.push_back(Ops[1]);
9983   Function *F = CGM.getIntrinsic(IntID);
9984   return Builder.CreateCall(F, NewOps);
9985 }
9986 
9987 Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
9988                                          SmallVectorImpl<Value *> &Ops,
9989                                          unsigned IntID) {
9990   auto *VecTy = getSVEType(TypeFlags);
9991   Function *F = CGM.getIntrinsic(IntID, VecTy);
9992   if (TypeFlags.isReadZA())
9993     Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
9994   else if (TypeFlags.isWriteZA())
9995     Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy);
9996   return Builder.CreateCall(F, Ops);
9997 }
9998 
9999 Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
10000                                     SmallVectorImpl<Value *> &Ops,
10001                                     unsigned IntID) {
10002   // svzero_za() zeros the entire ZA array and takes no parameters.
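  // An all-ones tile mask (255) selects every 64-bit ZA tile.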
10003   if (Ops.size() == 0)
10004     Ops.push_back(llvm::ConstantInt::get(Int32Ty, 255));
10005   Function *F = CGM.getIntrinsic(IntID, {});
10006   return Builder.CreateCall(F, Ops);
10007 }
10008 
10009 Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
10010                                       SmallVectorImpl<Value *> &Ops,
10011                                       unsigned IntID) {
10012   if (Ops.size() == 2)
10013     Ops.push_back(Builder.getInt32(0));
10014   else
10015     Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
10016   Function *F = CGM.getIntrinsic(IntID, {});
10017   return Builder.CreateCall(F, Ops);
10018 }
10019 
10020 // Splat a scalar operand to all elements of the scalable vector type Ty using
10021 // IRBuilder::CreateVectorSplat.
10022 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
10023   return Builder.CreateVectorSplat(
10024       cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
10025 }
10026 
10027 Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) {
10028   return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
10029 }
10030 
10031 Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
10032   // FIXME: For big endian this needs an additional REV, or needs a separate
10033   // intrinsic that is code-generated as a no-op, because the LLVM bitcast
10034   // instruction is defined as a 'bitwise' equivalent from the memory point of
10035   // view (when storing/reloading), whereas the svreinterpret builtin
10036   // implements a bitwise-equivalent cast from the register point of view.
10037   // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
10038   return Builder.CreateBitCast(Val, Ty);
10039 }
10040 
10041 static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
10042                                       SmallVectorImpl<Value *> &Ops) {
10043   auto *SplatZero = Constant::getNullValue(Ty);
10044   Ops.insert(Ops.begin(), SplatZero);
10045 }
10046 
10047 static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
10048                                        SmallVectorImpl<Value *> &Ops) {
10049   auto *SplatUndef = UndefValue::get(Ty);
10050   Ops.insert(Ops.begin(), SplatUndef);
10051 }
10052 
10053 SmallVector<llvm::Type *, 2>
10054 CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
10055                                      llvm::Type *ResultType,
10056                                      ArrayRef<Value *> Ops) {
10057   if (TypeFlags.isOverloadNone())
10058     return {};
10059 
10060   llvm::Type *DefaultType = getSVEType(TypeFlags);
10061 
10062   if (TypeFlags.isOverloadWhileOrMultiVecCvt())
10063     return {DefaultType, Ops[1]->getType()};
10064 
10065   if (TypeFlags.isOverloadWhileRW())
10066     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
10067 
10068   if (TypeFlags.isOverloadCvt())
10069     return {Ops[0]->getType(), Ops.back()->getType()};
10070 
10071   if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
10072       ResultType->isVectorTy())
10073     return {ResultType, Ops[1]->getType()};
10074 
10075   assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
10076   return {DefaultType};
10077 }
10078 
10079 Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
10080                                              llvm::Type *Ty,
10081                                              ArrayRef<Value *> Ops) {
10082   assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
10083          "Expects TypeFlags.isTupleSet() or TypeFlags.isTupleGet()");
10084 
10085   unsigned I = cast<ConstantInt>(Ops[1])->getSExtValue();
10086   auto *SingleVecTy = dyn_cast<llvm::ScalableVectorType>(
10087                       TypeFlags.isTupleSet() ? Ops[2]->getType() : Ty);
10088   Value *Idx = ConstantInt::get(CGM.Int64Ty,
10089                                 I * SingleVecTy->getMinNumElements());
10090 
10091   if (TypeFlags.isTupleSet())
10092     return Builder.CreateInsertVector(Ty, Ops[0], Ops[2], Idx);
10093   return Builder.CreateExtractVector(Ty, Ops[0], Idx);
10094 }
10095 
10096 Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
10097                                              llvm::Type *Ty,
10098                                              ArrayRef<Value *> Ops) {
10099   assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
10100 
10101   auto *SrcTy = dyn_cast<llvm::ScalableVectorType>(Ops[0]->getType());
10102   unsigned MinElts = SrcTy->getMinNumElements();
10103   Value *Call = llvm::PoisonValue::get(Ty);
10104   for (unsigned I = 0; I < Ops.size(); I++) {
10105     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
10106     Call = Builder.CreateInsertVector(Ty, Call, Ops[I], Idx);
10107   }
10108 
10109   return Call;
10110 }
10111 
10112 Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
10113   // Multi-vector results should be broken up into a single (wide) result
10114   // vector.
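  // For example, a {<vscale x 4 x i32>, <vscale x 4 x i32>} return is widened
  // to a single <vscale x 8 x i32> value.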
10115   auto *StructTy = dyn_cast<StructType>(Call->getType());
10116   if (!StructTy)
10117     return Call;
10118 
10119   auto *VTy = dyn_cast<ScalableVectorType>(StructTy->getTypeAtIndex(0U));
10120   if (!VTy)
10121     return Call;
10122   unsigned N = StructTy->getNumElements();
10123 
10124   // We may need to emit a cast to an svbool_t.
10125   bool IsPredTy = VTy->getElementType()->isIntegerTy(1);
10126   unsigned MinElts = IsPredTy ? 16 : VTy->getMinNumElements();
10127 
10128   ScalableVectorType *WideVTy =
10129       ScalableVectorType::get(VTy->getElementType(), MinElts * N);
10130   Value *Ret = llvm::PoisonValue::get(WideVTy);
10131   for (unsigned I = 0; I < N; ++I) {
10132     Value *SRet = Builder.CreateExtractValue(Call, I);
10133     assert(SRet->getType() == VTy && "Unexpected type for result value");
10134     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
10135 
10136     if (IsPredTy)
10137       SRet = EmitSVEPredicateCast(
10138           SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16));
10139 
10140     Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx);
10141   }
10142   Call = Ret;
10143 
10144   return Call;
10145 }
10146 
10147 void CodeGenFunction::GetAArch64SVEProcessedOperands(
10148     unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
10149     SVETypeFlags TypeFlags) {
10150   // Find out if any arguments are required to be integer constant expressions.
10151   unsigned ICEArguments = 0;
10152   ASTContext::GetBuiltinTypeError Error;
10153   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
10154   assert(Error == ASTContext::GE_None && "Should not codegen an error");
10155 
10156   // Tuple set/get only requires one insert/extract vector, which is
10157   // created by EmitSVETupleSetOrGet.
10158   bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
10159 
10160   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
10161     bool IsICE = ICEArguments & (1 << i);
10162     Value *Arg = EmitScalarExpr(E->getArg(i));
10163 
10164     if (IsICE) {
10165       // If this is required to be a constant, constant fold it so that we know
10166       // that the generated intrinsic gets a ConstantInt.
10167       std::optional<llvm::APSInt> Result =
10168           E->getArg(i)->getIntegerConstantExpr(getContext());
10169       assert(Result && "Expected argument to be a constant");
10170 
10171       // Immediates for SVE llvm intrinsics are always 32-bit.  We can safely
10172       // truncate because the immediate has been range checked and no valid
10173       // immediate requires more than a handful of bits.
10174       *Result = Result->extOrTrunc(32);
10175       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
10176       continue;
10177     }
10178 
10179     if (IsTupleGetOrSet || !isa<ScalableVectorType>(Arg->getType())) {
10180       Ops.push_back(Arg);
10181       continue;
10182     }
10183 
10184     auto *VTy = cast<ScalableVectorType>(Arg->getType());
10185     unsigned MinElts = VTy->getMinNumElements();
10186     bool IsPred = VTy->getElementType()->isIntegerTy(1);
10187     unsigned N = (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 16 : 128);
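    // N is the number of single vectors or predicates packed into this
    // argument (data vectors are multiples of 128 bits, predicates of 16 x i1).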
10188 
10189     if (N == 1) {
10190       Ops.push_back(Arg);
10191       continue;
10192     }
10193 
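    // Split the wide tuple argument into its N single-vector parts so each can
    // be passed to the intrinsic as a separate operand.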
10194     for (unsigned I = 0; I < N; ++I) {
10195       Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N);
10196       auto *NewVTy =
10197           ScalableVectorType::get(VTy->getElementType(), MinElts / N);
10198       Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx));
10199     }
10200   }
10201 }
10202 
10203 Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
10204                                                   const CallExpr *E) {
10205   llvm::Type *Ty = ConvertType(E->getType());
10206   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
10207       BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
10208     Value *Val = EmitScalarExpr(E->getArg(0));
10209     return EmitSVEReinterpret(Val, Ty);
10210   }
10211 
10212   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
10213                                               AArch64SVEIntrinsicsProvenSorted);
10214 
10215   llvm::SmallVector<Value *, 4> Ops;
10216   SVETypeFlags TypeFlags(Builtin->TypeModifier);
10217   GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
10218 
10219   if (TypeFlags.isLoad())
10220     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
10221                              TypeFlags.isZExtReturn());
10222   else if (TypeFlags.isStore())
10223     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
10224   else if (TypeFlags.isGatherLoad())
10225     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10226   else if (TypeFlags.isScatterStore())
10227     return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10228   else if (TypeFlags.isPrefetch())
10229     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10230   else if (TypeFlags.isGatherPrefetch())
10231     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10232   else if (TypeFlags.isStructLoad())
10233     return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10234   else if (TypeFlags.isStructStore())
10235     return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10236   else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
10237     return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
10238   else if (TypeFlags.isTupleCreate())
10239     return EmitSVETupleCreate(TypeFlags, Ty, Ops);
10240   else if (TypeFlags.isUndef())
10241     return UndefValue::get(Ty);
10242   else if (Builtin->LLVMIntrinsic != 0) {
10243     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
10244       InsertExplicitZeroOperand(Builder, Ty, Ops);
10245 
10246     if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
10247       InsertExplicitUndefOperand(Builder, Ty, Ops);
10248 
10249     // Some ACLE builtins leave out the argument to specify the predicate
10250     // pattern, which is expected to be expanded to an SV_ALL pattern.
10251     if (TypeFlags.isAppendSVALL())
10252       Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31));
10253     if (TypeFlags.isInsertOp1SVALL())
10254       Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
10255 
10256     // Predicates must match the main datatype.
10257     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
10258       if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
10259         if (PredTy->getElementType()->isIntegerTy(1))
10260           Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
10261 
10262     // Splat scalar operand to vector (intrinsics with _n infix)
10263     if (TypeFlags.hasSplatOperand()) {
10264       unsigned OpNo = TypeFlags.getSplatOperand();
10265       Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
10266     }
10267 
10268     if (TypeFlags.isReverseCompare())
10269       std::swap(Ops[1], Ops[2]);
10270     else if (TypeFlags.isReverseUSDOT())
10271       std::swap(Ops[1], Ops[2]);
10272     else if (TypeFlags.isReverseMergeAnyBinOp() &&
10273              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
10274       std::swap(Ops[1], Ops[2]);
10275     else if (TypeFlags.isReverseMergeAnyAccOp() &&
10276              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
10277       std::swap(Ops[1], Ops[3]);
10278 
10279     // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
10280     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
10281       llvm::Type *OpndTy = Ops[1]->getType();
10282       auto *SplatZero = Constant::getNullValue(OpndTy);
10283       Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero);
10284     }
10285 
10286     Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
10287                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
10288     Value *Call = Builder.CreateCall(F, Ops);
10289 
10290     // Predicate results must be converted to svbool_t.
10291     if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
10292       if (PredTy->getScalarType()->isIntegerTy(1))
10293         Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
10294 
10295     return FormSVEBuiltinResult(Call);
10296   }
10297 
10298   switch (BuiltinID) {
10299   default:
10300     return nullptr;
10301 
10302   case SVE::BI__builtin_sve_svreinterpret_b: {
10303     auto SVCountTy =
10304         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10305     Function *CastFromSVCountF =
10306         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
10307     return Builder.CreateCall(CastFromSVCountF, Ops[0]);
10308   }
10309   case SVE::BI__builtin_sve_svreinterpret_c: {
10310     auto SVCountTy =
10311         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10312     Function *CastToSVCountF =
10313         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
10314     return Builder.CreateCall(CastToSVCountF, Ops[0]);
10315   }
10316 
10317   case SVE::BI__builtin_sve_svpsel_lane_b8:
10318   case SVE::BI__builtin_sve_svpsel_lane_b16:
10319   case SVE::BI__builtin_sve_svpsel_lane_b32:
10320   case SVE::BI__builtin_sve_svpsel_lane_b64:
10321   case SVE::BI__builtin_sve_svpsel_lane_c8:
10322   case SVE::BI__builtin_sve_svpsel_lane_c16:
10323   case SVE::BI__builtin_sve_svpsel_lane_c32:
10324   case SVE::BI__builtin_sve_svpsel_lane_c64: {
10325     bool IsSVCount = isa<TargetExtType>(Ops[0]->getType());
10326     assert(((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() ==
10327                                "aarch64.svcount")) &&
10328            "Unexpected TargetExtType");
10329     auto SVCountTy =
10330         llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10331     Function *CastFromSVCountF =
10332         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
10333     Function *CastToSVCountF =
10334         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
10335 
10336     auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier));
10337     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy);
10338     llvm::Value *Ops0 =
10339         IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0];
10340     llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy);
10341     llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops[2]});
10342     return IsSVCount ? Builder.CreateCall(CastToSVCountF, PSel) : PSel;
10343   }
10344   case SVE::BI__builtin_sve_svmov_b_z: {
10345     // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
10346     SVETypeFlags TypeFlags(Builtin->TypeModifier);
10347     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
10348     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
10349     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
10350   }
10351 
10352   case SVE::BI__builtin_sve_svnot_b_z: {
10353     // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
10354     SVETypeFlags TypeFlags(Builtin->TypeModifier);
10355     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
10356     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
10357     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
10358   }
10359 
10360   case SVE::BI__builtin_sve_svmovlb_u16:
10361   case SVE::BI__builtin_sve_svmovlb_u32:
10362   case SVE::BI__builtin_sve_svmovlb_u64:
10363     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
10364 
10365   case SVE::BI__builtin_sve_svmovlb_s16:
10366   case SVE::BI__builtin_sve_svmovlb_s32:
10367   case SVE::BI__builtin_sve_svmovlb_s64:
10368     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
10369 
10370   case SVE::BI__builtin_sve_svmovlt_u16:
10371   case SVE::BI__builtin_sve_svmovlt_u32:
10372   case SVE::BI__builtin_sve_svmovlt_u64:
10373     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
10374 
10375   case SVE::BI__builtin_sve_svmovlt_s16:
10376   case SVE::BI__builtin_sve_svmovlt_s32:
10377   case SVE::BI__builtin_sve_svmovlt_s64:
10378     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
10379 
10380   case SVE::BI__builtin_sve_svpmullt_u16:
10381   case SVE::BI__builtin_sve_svpmullt_u64:
10382   case SVE::BI__builtin_sve_svpmullt_n_u16:
10383   case SVE::BI__builtin_sve_svpmullt_n_u64:
10384     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
10385 
10386   case SVE::BI__builtin_sve_svpmullb_u16:
10387   case SVE::BI__builtin_sve_svpmullb_u64:
10388   case SVE::BI__builtin_sve_svpmullb_n_u16:
10389   case SVE::BI__builtin_sve_svpmullb_n_u64:
10390     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
10391 
10392   case SVE::BI__builtin_sve_svdup_n_b8:
10393   case SVE::BI__builtin_sve_svdup_n_b16:
10394   case SVE::BI__builtin_sve_svdup_n_b32:
10395   case SVE::BI__builtin_sve_svdup_n_b64: {
10396     Value *CmpNE =
10397         Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
10398     llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
10399     Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
10400     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
10401   }
10402 
10403   case SVE::BI__builtin_sve_svdupq_n_b8:
10404   case SVE::BI__builtin_sve_svdupq_n_b16:
10405   case SVE::BI__builtin_sve_svdupq_n_b32:
10406   case SVE::BI__builtin_sve_svdupq_n_b64:
10407   case SVE::BI__builtin_sve_svdupq_n_u8:
10408   case SVE::BI__builtin_sve_svdupq_n_s8:
10409   case SVE::BI__builtin_sve_svdupq_n_u64:
10410   case SVE::BI__builtin_sve_svdupq_n_f64:
10411   case SVE::BI__builtin_sve_svdupq_n_s64:
10412   case SVE::BI__builtin_sve_svdupq_n_u16:
10413   case SVE::BI__builtin_sve_svdupq_n_f16:
10414   case SVE::BI__builtin_sve_svdupq_n_bf16:
10415   case SVE::BI__builtin_sve_svdupq_n_s16:
10416   case SVE::BI__builtin_sve_svdupq_n_u32:
10417   case SVE::BI__builtin_sve_svdupq_n_f32:
10418   case SVE::BI__builtin_sve_svdupq_n_s32: {
10419     // These builtins build a 128-bit vector from the scalar operands and then
10420     // replicate it across the vector using the dupq_lane intrinsic.
10421     unsigned NumOpnds = Ops.size();
10422 
10423     bool IsBoolTy =
10424         cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
10425 
10426     // For svdupq_n_b* the element type is an integer of width 128/numelts bits,
10427     // so that the compare can use the width that is natural for the expected
10428     // number of predicate lanes.
10429     llvm::Type *EltTy = Ops[0]->getType();
10430     if (IsBoolTy)
10431       EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
10432 
10433     SmallVector<llvm::Value *, 16> VecOps;
10434     for (unsigned I = 0; I < NumOpnds; ++I)
10435         VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
10436     Value *Vec = BuildVector(VecOps);
10437 
10438     llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
10439     Value *InsertSubVec = Builder.CreateInsertVector(
10440         OverloadedTy, PoisonValue::get(OverloadedTy), Vec, Builder.getInt64(0));
10441 
10442     Function *F =
10443         CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
10444     Value *DupQLane =
10445         Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
10446 
10447     if (!IsBoolTy)
10448       return DupQLane;
10449 
10450     SVETypeFlags TypeFlags(Builtin->TypeModifier);
10451     Value *Pred = EmitSVEAllTruePred(TypeFlags);
10452 
10453     // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
10454     F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
10455                                        : Intrinsic::aarch64_sve_cmpne_wide,
10456                          OverloadedTy);
10457     Value *Call = Builder.CreateCall(
10458         F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
10459     return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
10460   }
10461 
10462   case SVE::BI__builtin_sve_svpfalse_b:
10463     return ConstantInt::getFalse(Ty);
10464 
10465   case SVE::BI__builtin_sve_svpfalse_c: {
10466     auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16);
10467     Function *CastToSVCountF =
10468         CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, Ty);
10469     return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy));
10470   }
10471 
10472   case SVE::BI__builtin_sve_svlen_bf16:
10473   case SVE::BI__builtin_sve_svlen_f16:
10474   case SVE::BI__builtin_sve_svlen_f32:
10475   case SVE::BI__builtin_sve_svlen_f64:
10476   case SVE::BI__builtin_sve_svlen_s8:
10477   case SVE::BI__builtin_sve_svlen_s16:
10478   case SVE::BI__builtin_sve_svlen_s32:
10479   case SVE::BI__builtin_sve_svlen_s64:
10480   case SVE::BI__builtin_sve_svlen_u8:
10481   case SVE::BI__builtin_sve_svlen_u16:
10482   case SVE::BI__builtin_sve_svlen_u32:
10483   case SVE::BI__builtin_sve_svlen_u64: {
10484     SVETypeFlags TF(Builtin->TypeModifier);
10485     auto VTy = cast<llvm::VectorType>(getSVEType(TF));
10486     auto *NumEls =
10487         llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue());
10488 
10489     Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty);
10490     return Builder.CreateMul(NumEls, Builder.CreateCall(F));
10491   }
10492 
10493   case SVE::BI__builtin_sve_svtbl2_u8:
10494   case SVE::BI__builtin_sve_svtbl2_s8:
10495   case SVE::BI__builtin_sve_svtbl2_u16:
10496   case SVE::BI__builtin_sve_svtbl2_s16:
10497   case SVE::BI__builtin_sve_svtbl2_u32:
10498   case SVE::BI__builtin_sve_svtbl2_s32:
10499   case SVE::BI__builtin_sve_svtbl2_u64:
10500   case SVE::BI__builtin_sve_svtbl2_s64:
10501   case SVE::BI__builtin_sve_svtbl2_f16:
10502   case SVE::BI__builtin_sve_svtbl2_bf16:
10503   case SVE::BI__builtin_sve_svtbl2_f32:
10504   case SVE::BI__builtin_sve_svtbl2_f64: {
10505     SVETypeFlags TF(Builtin->TypeModifier);
10506     auto VTy = cast<llvm::ScalableVectorType>(getSVEType(TF));
10507     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
10508     return Builder.CreateCall(F, Ops);
10509   }
10510 
10511   case SVE::BI__builtin_sve_svset_neonq_s8:
10512   case SVE::BI__builtin_sve_svset_neonq_s16:
10513   case SVE::BI__builtin_sve_svset_neonq_s32:
10514   case SVE::BI__builtin_sve_svset_neonq_s64:
10515   case SVE::BI__builtin_sve_svset_neonq_u8:
10516   case SVE::BI__builtin_sve_svset_neonq_u16:
10517   case SVE::BI__builtin_sve_svset_neonq_u32:
10518   case SVE::BI__builtin_sve_svset_neonq_u64:
10519   case SVE::BI__builtin_sve_svset_neonq_f16:
10520   case SVE::BI__builtin_sve_svset_neonq_f32:
10521   case SVE::BI__builtin_sve_svset_neonq_f64:
10522   case SVE::BI__builtin_sve_svset_neonq_bf16: {
10523     return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], Builder.getInt64(0));
10524   }
10525 
10526   case SVE::BI__builtin_sve_svget_neonq_s8:
10527   case SVE::BI__builtin_sve_svget_neonq_s16:
10528   case SVE::BI__builtin_sve_svget_neonq_s32:
10529   case SVE::BI__builtin_sve_svget_neonq_s64:
10530   case SVE::BI__builtin_sve_svget_neonq_u8:
10531   case SVE::BI__builtin_sve_svget_neonq_u16:
10532   case SVE::BI__builtin_sve_svget_neonq_u32:
10533   case SVE::BI__builtin_sve_svget_neonq_u64:
10534   case SVE::BI__builtin_sve_svget_neonq_f16:
10535   case SVE::BI__builtin_sve_svget_neonq_f32:
10536   case SVE::BI__builtin_sve_svget_neonq_f64:
10537   case SVE::BI__builtin_sve_svget_neonq_bf16: {
10538     return Builder.CreateExtractVector(Ty, Ops[0], Builder.getInt64(0));
10539   }
10540 
10541   case SVE::BI__builtin_sve_svdup_neonq_s8:
10542   case SVE::BI__builtin_sve_svdup_neonq_s16:
10543   case SVE::BI__builtin_sve_svdup_neonq_s32:
10544   case SVE::BI__builtin_sve_svdup_neonq_s64:
10545   case SVE::BI__builtin_sve_svdup_neonq_u8:
10546   case SVE::BI__builtin_sve_svdup_neonq_u16:
10547   case SVE::BI__builtin_sve_svdup_neonq_u32:
10548   case SVE::BI__builtin_sve_svdup_neonq_u64:
10549   case SVE::BI__builtin_sve_svdup_neonq_f16:
10550   case SVE::BI__builtin_sve_svdup_neonq_f32:
10551   case SVE::BI__builtin_sve_svdup_neonq_f64:
10552   case SVE::BI__builtin_sve_svdup_neonq_bf16: {
10553     Value *Insert = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
10554                                                Builder.getInt64(0));
10555     return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty},
10556                                    {Insert, Builder.getInt64(0)});
10557   }
10558   }
10559 
10560   // Should not happen.
10561   return nullptr;
10562 }
10563 
10564 static void swapCommutativeSMEOperands(unsigned BuiltinID,
10565                                        SmallVectorImpl<Value *> &Ops) {
10566   unsigned MultiVec;
10567   switch (BuiltinID) {
10568   default:
10569     return;
10570   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
10571     MultiVec = 1;
10572     break;
10573   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
10574   case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
10575     MultiVec = 2;
10576     break;
10577   case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
10578   case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
10579     MultiVec = 4;
10580     break;
10581   }
10582 
10583   if (MultiVec > 0)
10584     for (unsigned I = 0; I < MultiVec; ++I)
10585       std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]);
10586 }
10587 
10588 Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
10589                                                   const CallExpr *E) {
10590   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
10591                                               AArch64SMEIntrinsicsProvenSorted);
10592 
10593   llvm::SmallVector<Value *, 4> Ops;
10594   SVETypeFlags TypeFlags(Builtin->TypeModifier);
10595   GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
10596 
10597   if (TypeFlags.isLoad() || TypeFlags.isStore())
10598     return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10599   else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
10600     return EmitSMEReadWrite(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10601   else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
10602            BuiltinID == SME::BI__builtin_sme_svzero_za)
10603     return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10604   else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
10605            BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
10606            BuiltinID == SME::BI__builtin_sme_svldr_za ||
10607            BuiltinID == SME::BI__builtin_sme_svstr_za)
10608     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10609 
10610   // Handle builtins which require their multi-vector operands to be swapped
10611   swapCommutativeSMEOperands(BuiltinID, Ops);
10612 
10613   // Should not happen!
10614   if (Builtin->LLVMIntrinsic == 0)
10615     return nullptr;
10616 
10617   // Predicates must match the main datatype.
10618   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
10619     if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
10620       if (PredTy->getElementType()->isIntegerTy(1))
10621         Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
10622 
10623   Function *F =
10624       TypeFlags.isOverloadNone()
10625           ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
10626           : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
10627   Value *Call = Builder.CreateCall(F, Ops);
10628 
10629   return FormSVEBuiltinResult(Call);
10630 }
10631 
10632 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
10633                                                const CallExpr *E,
10634                                                llvm::Triple::ArchType Arch) {
10635   if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
10636       BuiltinID <= clang::AArch64::LastSVEBuiltin)
10637     return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
10638 
10639   if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
10640       BuiltinID <= clang::AArch64::LastSMEBuiltin)
10641     return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
10642 
10643   unsigned HintID = static_cast<unsigned>(-1);
10644   switch (BuiltinID) {
10645   default: break;
10646   case clang::AArch64::BI__builtin_arm_nop:
10647     HintID = 0;
10648     break;
10649   case clang::AArch64::BI__builtin_arm_yield:
10650   case clang::AArch64::BI__yield:
10651     HintID = 1;
10652     break;
10653   case clang::AArch64::BI__builtin_arm_wfe:
10654   case clang::AArch64::BI__wfe:
10655     HintID = 2;
10656     break;
10657   case clang::AArch64::BI__builtin_arm_wfi:
10658   case clang::AArch64::BI__wfi:
10659     HintID = 3;
10660     break;
10661   case clang::AArch64::BI__builtin_arm_sev:
10662   case clang::AArch64::BI__sev:
10663     HintID = 4;
10664     break;
10665   case clang::AArch64::BI__builtin_arm_sevl:
10666   case clang::AArch64::BI__sevl:
10667     HintID = 5;
10668     break;
10669   }
10670 
10671   if (HintID != static_cast<unsigned>(-1)) {
10672     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
10673     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
10674   }
10675 
10676   if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
10677     // Create call to __arm_sme_state and store the results to the two pointers.
10678     CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
10679         llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
10680                                 false),
10681         "__arm_sme_state"));
10682     auto Attrs =
10683         AttributeList()
10684             .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible")
10685             .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved");
10686     CI->setAttributes(Attrs);
10687     CI->setCallingConv(
10688         llvm::CallingConv::
10689             AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
10690     Builder.CreateStore(Builder.CreateExtractValue(CI, 0),
10691                         EmitPointerWithAlignment(E->getArg(0)));
10692     return Builder.CreateStore(Builder.CreateExtractValue(CI, 1),
10693                                EmitPointerWithAlignment(E->getArg(1)));
10694   }
10695 
10696   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
10697     assert((getContext().getTypeSize(E->getType()) == 32) &&
10698            "rbit of unusual size!");
10699     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10700     return Builder.CreateCall(
10701         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10702   }
10703   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
10704     assert((getContext().getTypeSize(E->getType()) == 64) &&
10705            "rbit of unusual size!");
10706     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10707     return Builder.CreateCall(
10708         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10709   }
10710 
10711   if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
10712       BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
10713     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10714     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
10715     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
10716     if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
10717       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
10718     return Res;
10719   }
10720 
10721   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
10722     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10723     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
10724                               "cls");
10725   }
10726   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
10727     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10728     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
10729                               "cls");
10730   }
10731 
10732   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
10733       BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
10734     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10735     llvm::Type *Ty = Arg->getType();
10736     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty),
10737                               Arg, "frint32z");
10738   }
10739 
10740   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
10741       BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
10742     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10743     llvm::Type *Ty = Arg->getType();
10744     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty),
10745                               Arg, "frint64z");
10746   }
10747 
10748   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
10749       BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
10750     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10751     llvm::Type *Ty = Arg->getType();
10752     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty),
10753                               Arg, "frint32x");
10754   }
10755 
10756   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
10757       BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
10758     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10759     llvm::Type *Ty = Arg->getType();
10760     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty),
10761                               Arg, "frint64x");
10762   }
10763 
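  // __builtin_arm_jcvt maps to FJCVTZS, the "JavaScript convert" instruction:
  // it converts a double to a signed 32-bit integer using round-toward-zero
  // with the modulo-2^32 wrap-around behaviour of ECMAScript's ToInt32.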
10764   if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
10765     assert((getContext().getTypeSize(E->getType()) == 32) &&
10766            "__jcvt of unusual size!");
10767     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10768     return Builder.CreateCall(
10769         CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
10770   }
10771 
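  // These builtins correspond to the Armv8.7-A LS64 extension (LD64B, ST64B,
  // ST64BV, ST64BV0), which transfers a 64-byte quantity; the data is modelled
  // here as eight consecutive i64 values read from or written to the user
  // buffer (ValPtr below).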
10772   if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
10773       BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
10774       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
10775       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
10776     llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
10777     llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
10778 
10779     if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
10780       // Load from the address via an LLVM intrinsic, receiving a
10781       // tuple of 8 i64 words, and store each one to ValPtr.
10782       Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
10783       llvm::Value *Val = Builder.CreateCall(F, MemAddr);
10784       llvm::Value *ToRet;
10785       for (size_t i = 0; i < 8; i++) {
10786         llvm::Value *ValOffsetPtr =
10787             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10788         Address Addr =
10789             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10790         ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
10791       }
10792       return ToRet;
10793     } else {
10794       // Load 8 i64 words from ValPtr, and store them to the address
10795       // via an LLVM intrinsic.
10796       SmallVector<llvm::Value *, 9> Args;
10797       Args.push_back(MemAddr);
10798       for (size_t i = 0; i < 8; i++) {
10799         llvm::Value *ValOffsetPtr =
10800             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10801         Address Addr =
10802             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10803         Args.push_back(Builder.CreateLoad(Addr));
10804       }
10805 
10806       auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
10807                        ? Intrinsic::aarch64_st64b
10808                    : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
10809                        ? Intrinsic::aarch64_st64bv
10810                        : Intrinsic::aarch64_st64bv0);
10811       Function *F = CGM.getIntrinsic(Intr);
10812       return Builder.CreateCall(F, Args);
10813     }
10814   }
10815 
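  // __builtin_arm_rndr / __builtin_arm_rndrrs (RNDR / RNDRRS): the intrinsic
  // returns a {random value, status} pair; the value is stored through the
  // pointer argument and the zero-extended status bit becomes the result.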
10816   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
10817       BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
10818 
10819     auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
10820                      ? Intrinsic::aarch64_rndr
10821                      : Intrinsic::aarch64_rndrrs);
10822     Function *F = CGM.getIntrinsic(Intr);
10823     llvm::Value *Val = Builder.CreateCall(F);
10824     Value *RandomValue = Builder.CreateExtractValue(Val, 0);
10825     Value *Status = Builder.CreateExtractValue(Val, 1);
10826 
10827     Address MemAddress = EmitPointerWithAlignment(E->getArg(0));
10828     Builder.CreateStore(RandomValue, MemAddress);
10829     Status = Builder.CreateZExt(Status, Int32Ty);
10830     return Status;
10831   }
10832 
10833   if (BuiltinID == clang::AArch64::BI__clear_cache) {
10834     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
10835     const FunctionDecl *FD = E->getDirectCallee();
10836     Value *Ops[2];
10837     for (unsigned i = 0; i < 2; i++)
10838       Ops[i] = EmitScalarExpr(E->getArg(i));
10839     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
10840     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
10841     StringRef Name = FD->getName();
10842     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
10843   }
10844 
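  // 128-bit exclusive loads use LDXP/LDAXP, which produce two i64 halves; the
  // halves are zero-extended and recombined into an i128 (the struct's second
  // element supplies the high 64 bits here) before being bitcast to the
  // builtin's 128-bit result type.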
10845   if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10846        BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
10847       getContext().getTypeSize(E->getType()) == 128) {
10848     Function *F =
10849         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10850                              ? Intrinsic::aarch64_ldaxp
10851                              : Intrinsic::aarch64_ldxp);
10852 
10853     Value *LdPtr = EmitScalarExpr(E->getArg(0));
10854     Value *Val = Builder.CreateCall(F, LdPtr, "ldxp");
10855 
10856     Value *Val0 = Builder.CreateExtractValue(Val, 1);
10857     Value *Val1 = Builder.CreateExtractValue(Val, 0);
10858     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
10859     Val0 = Builder.CreateZExt(Val0, Int128Ty);
10860     Val1 = Builder.CreateZExt(Val1, Int128Ty);
10861 
10862     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
10863     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
10864     Val = Builder.CreateOr(Val, Val1);
10865     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
10866   } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10867              BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
10868     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
10869 
10870     QualType Ty = E->getType();
10871     llvm::Type *RealResTy = ConvertType(Ty);
10872     llvm::Type *IntTy =
10873         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10874 
10875     Function *F =
10876         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10877                              ? Intrinsic::aarch64_ldaxr
10878                              : Intrinsic::aarch64_ldxr,
10879                          UnqualPtrTy);
10880     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
10881     Val->addParamAttr(
10882         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
10883 
10884     if (RealResTy->isPointerTy())
10885       return Builder.CreateIntToPtr(Val, RealResTy);
10886 
10887     llvm::Type *IntResTy = llvm::IntegerType::get(
10888         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
10889     return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
10890                                  RealResTy);
10891   }
10892 
10893   if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10894        BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
10895       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
10896     Function *F =
10897         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10898                              ? Intrinsic::aarch64_stlxp
10899                              : Intrinsic::aarch64_stxp);
10900     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
10901 
10902     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
10903     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
10904 
10905     Tmp = Tmp.withElementType(STy);
10906     llvm::Value *Val = Builder.CreateLoad(Tmp);
10907 
10908     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
10909     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
10910     Value *StPtr = EmitScalarExpr(E->getArg(1));
10911     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
10912   }
10913 
10914   if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10915       BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
10916     Value *StoreVal = EmitScalarExpr(E->getArg(0));
10917     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
10918 
10919     QualType Ty = E->getArg(0)->getType();
10920     llvm::Type *StoreTy =
10921         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10922 
10923     if (StoreVal->getType()->isPointerTy())
10924       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
10925     else {
10926       llvm::Type *IntTy = llvm::IntegerType::get(
10927           getLLVMContext(),
10928           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
10929       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
10930       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
10931     }
10932 
10933     Function *F =
10934         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10935                              ? Intrinsic::aarch64_stlxr
10936                              : Intrinsic::aarch64_stxr,
10937                          StoreAddr->getType());
10938     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
10939     CI->addParamAttr(
10940         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
10941     return CI;
10942   }
10943 
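  // __getReg(n) reads general-purpose register x<n> (or sp when n == 31) by
  // calling llvm.read_register.i64 with named-register metadata; e.g.
  // __getReg(29) reads the frame pointer x29.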
10944   if (BuiltinID == clang::AArch64::BI__getReg) {
10945     Expr::EvalResult Result;
10946     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10947       llvm_unreachable("Sema will ensure that the parameter is constant");
10948 
10949     llvm::APSInt Value = Result.Val.getInt();
10950     LLVMContext &Context = CGM.getLLVMContext();
10951     std::string Reg = Value == 31 ? "sp" : "x" + toString(Value, 10);
10952 
10953     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
10954     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10955     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10956 
10957     llvm::Function *F =
10958         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
10959     return Builder.CreateCall(F, Metadata);
10960   }
10961 
10962   if (BuiltinID == clang::AArch64::BI__break) {
10963     Expr::EvalResult Result;
10964     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10965       llvm_unreachable("Sema will ensure that the parameter is constant");
10966 
10967     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::aarch64_break);
10968     return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
10969   }
10970 
10971   if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
10972     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
10973     return Builder.CreateCall(F);
10974   }
10975 
10976   if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
10977     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
10978                                llvm::SyncScope::SingleThread);
10979 
10980   // CRC32
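  // Each __builtin_arm_crc32* maps directly to the matching aarch64.crc32*
  // intrinsic (the *d forms use the 64-bit crc32x/crc32cx variants); the data
  // operand is zero-extended or bitcast below to the width the intrinsic
  // expects.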
10981   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
10982   switch (BuiltinID) {
10983   case clang::AArch64::BI__builtin_arm_crc32b:
10984     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
10985   case clang::AArch64::BI__builtin_arm_crc32cb:
10986     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
10987   case clang::AArch64::BI__builtin_arm_crc32h:
10988     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
10989   case clang::AArch64::BI__builtin_arm_crc32ch:
10990     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
10991   case clang::AArch64::BI__builtin_arm_crc32w:
10992     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
10993   case clang::AArch64::BI__builtin_arm_crc32cw:
10994     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
10995   case clang::AArch64::BI__builtin_arm_crc32d:
10996     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
10997   case clang::AArch64::BI__builtin_arm_crc32cd:
10998     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
10999   }
11000 
11001   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
11002     Value *Arg0 = EmitScalarExpr(E->getArg(0));
11003     Value *Arg1 = EmitScalarExpr(E->getArg(1));
11004     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
11005 
11006     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
11007     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
11008 
11009     return Builder.CreateCall(F, {Arg0, Arg1});
11010   }
11011 
11012   // Memory Operations (MOPS)
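  // __builtin_arm_mops_memset_tag is the MTE-aware MOPS memset which, per the
  // ACLE, sets the memory contents and the MTE allocation tags for the range.
  // The operands are normalized to the intrinsic's (ptr, i8, i64) signature
  // below.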
11013   if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
11014     Value *Dst = EmitScalarExpr(E->getArg(0));
11015     Value *Val = EmitScalarExpr(E->getArg(1));
11016     Value *Size = EmitScalarExpr(E->getArg(2));
11017     Dst = Builder.CreatePointerCast(Dst, Int8PtrTy);
11018     Val = Builder.CreateTrunc(Val, Int8Ty);
11019     Size = Builder.CreateIntCast(Size, Int64Ty, false);
11020     return Builder.CreateCall(
11021         CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
11022   }
11023 
11024   // Memory Tagging Extensions (MTE) Intrinsics
11025   Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
11026   switch (BuiltinID) {
11027   case clang::AArch64::BI__builtin_arm_irg:
11028     MTEIntrinsicID = Intrinsic::aarch64_irg; break;
11029   case clang::AArch64::BI__builtin_arm_addg:
11030     MTEIntrinsicID = Intrinsic::aarch64_addg; break;
11031   case clang::AArch64::BI__builtin_arm_gmi:
11032     MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
11033   case clang::AArch64::BI__builtin_arm_ldg:
11034     MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
11035   case clang::AArch64::BI__builtin_arm_stg:
11036     MTEIntrinsicID = Intrinsic::aarch64_stg; break;
11037   case clang::AArch64::BI__builtin_arm_subp:
11038     MTEIntrinsicID = Intrinsic::aarch64_subp; break;
11039   }
11040 
11041   if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
11042     llvm::Type *T = ConvertType(E->getType());
11043 
11044     if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
11045       Value *Pointer = EmitScalarExpr(E->getArg(0));
11046       Value *Mask = EmitScalarExpr(E->getArg(1));
11047 
11048       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11049       Mask = Builder.CreateZExt(Mask, Int64Ty);
11050       Value *RV = Builder.CreateCall(
11051                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask});
11052       return Builder.CreatePointerCast(RV, T);
11053     }
11054     if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
11055       Value *Pointer = EmitScalarExpr(E->getArg(0));
11056       Value *TagOffset = EmitScalarExpr(E->getArg(1));
11057 
11058       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11059       TagOffset = Builder.CreateZExt(TagOffset, Int64Ty);
11060       Value *RV = Builder.CreateCall(
11061                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset});
11062       return Builder.CreatePointerCast(RV, T);
11063     }
11064     if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
11065       Value *Pointer = EmitScalarExpr(E->getArg(0));
11066       Value *ExcludedMask = EmitScalarExpr(E->getArg(1));
11067 
11068       ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty);
11069       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11070       return Builder.CreateCall(
11071                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask});
11072     }
11073     // Although it is possible to supply a different return
11074     // address (the first argument) to this intrinsic, for now we set
11075     // the return address to be the same as the input address.
11076     if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
11077       Value *TagAddress = EmitScalarExpr(E->getArg(0));
11078       TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
11079       Value *RV = Builder.CreateCall(
11080                     CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
11081       return Builder.CreatePointerCast(RV, T);
11082     }
11083     // Although it is possible to supply a different tag (to set)
11084     // to this intrinsic (as the first argument), for now we supply
11085     // the tag held in the input address argument (the common use case).
11086     if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
11087       Value *TagAddress = EmitScalarExpr(E->getArg(0));
11088       TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
11089       return Builder.CreateCall(
11090                CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
11091     }
11092     if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
11093       Value *PointerA = EmitScalarExpr(E->getArg(0));
11094       Value *PointerB = EmitScalarExpr(E->getArg(1));
11095       PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy);
11096       PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy);
11097       return Builder.CreateCall(
11098                        CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB});
11099     }
11100   }
11101 
11102   if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11103       BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
11104       BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11105       BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
11106       BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
11107       BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
11108       BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
11109       BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
11110 
11111     SpecialRegisterAccessKind AccessKind = Write;
11112     if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11113         BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
11114         BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11115         BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
11116       AccessKind = VolatileRead;
11117 
11118     bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
11119                             BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
11120 
11121     bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11122                    BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
11123 
11124     bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11125                     BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
11126 
11127     llvm::Type *ValueType;
11128     llvm::Type *RegisterType = Int64Ty;
11129     if (Is32Bit) {
11130       ValueType = Int32Ty;
11131     } else if (Is128Bit) {
11132       llvm::Type *Int128Ty =
11133           llvm::IntegerType::getInt128Ty(CGM.getLLVMContext());
11134       ValueType = Int128Ty;
11135       RegisterType = Int128Ty;
11136     } else if (IsPointerBuiltin) {
11137       ValueType = VoidPtrTy;
11138     } else {
11139       ValueType = Int64Ty;
11140     }
11141 
11142     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
11143                                       AccessKind);
11144   }
11145 
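  // _ReadStatusReg / _WriteStatusReg take an MSVC-style packed system-register
  // immediate; the fields are unpacked below (op0 is 2 or 3, encoded by bit
  // 14, then op1, CRn, CRm, op2) and re-emitted as the "op0:op1:CRn:CRm:op2"
  // string that llvm.read_register / llvm.write_register expect.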
11146   if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
11147       BuiltinID == clang::AArch64::BI_WriteStatusReg) {
11148     LLVMContext &Context = CGM.getLLVMContext();
11149 
11150     unsigned SysReg =
11151       E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
11152 
11153     std::string SysRegStr;
11154     llvm::raw_string_ostream(SysRegStr) <<
11155                        ((1 << 1) | ((SysReg >> 14) & 1))  << ":" <<
11156                        ((SysReg >> 11) & 7)               << ":" <<
11157                        ((SysReg >> 7)  & 15)              << ":" <<
11158                        ((SysReg >> 3)  & 15)              << ":" <<
11159                        ( SysReg        & 7);
11160 
11161     llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
11162     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11163     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11164 
11165     llvm::Type *RegisterType = Int64Ty;
11166     llvm::Type *Types[] = { RegisterType };
11167 
11168     if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
11169       llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
11170 
11171       return Builder.CreateCall(F, Metadata);
11172     }
11173 
11174     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
11175     llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
11176 
11177     return Builder.CreateCall(F, { Metadata, ArgValue });
11178   }
11179 
11180   if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
11181     llvm::Function *F =
11182         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
11183     return Builder.CreateCall(F);
11184   }
11185 
11186   if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
11187     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
11188     return Builder.CreateCall(F);
11189   }
11190 
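  // __mulh / __umulh return the high 64 bits of a 128-bit product: the
  // operands are widened to i128, multiplied, shifted right by 64 (arithmetic
  // shift for the signed form, logical for the unsigned form), and the high
  // bits are truncated back to the result type.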
11191   if (BuiltinID == clang::AArch64::BI__mulh ||
11192       BuiltinID == clang::AArch64::BI__umulh) {
11193     llvm::Type *ResType = ConvertType(E->getType());
11194     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
11195 
11196     bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
11197     Value *LHS =
11198         Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned);
11199     Value *RHS =
11200         Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned);
11201 
11202     Value *MulResult, *HigherBits;
11203     if (IsSigned) {
11204       MulResult = Builder.CreateNSWMul(LHS, RHS);
11205       HigherBits = Builder.CreateAShr(MulResult, 64);
11206     } else {
11207       MulResult = Builder.CreateNUWMul(LHS, RHS);
11208       HigherBits = Builder.CreateLShr(MulResult, 64);
11209     }
11210     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
11211 
11212     return HigherBits;
11213   }
11214 
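  // The __writex18* / __readx18* intrinsics access memory at a byte offset
  // from the platform register x18 (which holds the TEB pointer on Windows on
  // Arm64): read x18 as a pointer, add the offset, then store or load a value
  // of the requested width with byte alignment.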
11215   if (BuiltinID == AArch64::BI__writex18byte ||
11216       BuiltinID == AArch64::BI__writex18word ||
11217       BuiltinID == AArch64::BI__writex18dword ||
11218       BuiltinID == AArch64::BI__writex18qword) {
11219     // Read x18 as i8*
11220     LLVMContext &Context = CGM.getLLVMContext();
11221     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
11222     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11223     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11224     llvm::Function *F =
11225         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
11226     llvm::Value *X18 = Builder.CreateCall(F, Metadata);
11227     X18 = Builder.CreateIntToPtr(X18, Int8PtrTy);
11228 
11229     // Store val at x18 + offset
11230     Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
11231     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
11232     Value *Val = EmitScalarExpr(E->getArg(1));
11233     StoreInst *Store = Builder.CreateAlignedStore(Val, Ptr, CharUnits::One());
11234     return Store;
11235   }
11236 
11237   if (BuiltinID == AArch64::BI__readx18byte ||
11238       BuiltinID == AArch64::BI__readx18word ||
11239       BuiltinID == AArch64::BI__readx18dword ||
11240       BuiltinID == AArch64::BI__readx18qword) {
11241     llvm::Type *IntTy = ConvertType(E->getType());
11242 
11243     // Read x18 as i8*
11244     LLVMContext &Context = CGM.getLLVMContext();
11245     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
11246     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11247     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11248     llvm::Function *F =
11249         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
11250     llvm::Value *X18 = Builder.CreateCall(F, Metadata);
11251     X18 = Builder.CreateIntToPtr(X18, Int8PtrTy);
11252 
11253     // Load x18 + offset
11254     Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
11255     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
11256     LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One());
11257     return Load;
11258   }
11259 
11260   if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 ||
11261       BuiltinID == AArch64::BI_CopyFloatFromInt32 ||
11262       BuiltinID == AArch64::BI_CopyInt32FromFloat ||
11263       BuiltinID == AArch64::BI_CopyInt64FromDouble) {
11264     Value *Arg = EmitScalarExpr(E->getArg(0));
11265     llvm::Type *RetTy = ConvertType(E->getType());
11266     return Builder.CreateBitCast(Arg, RetTy);
11267   }
11268 
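  // _CountLeadingOnes* complement the input (XOR with all ones) before
  // counting leading zeros via llvm.ctlz; _CountLeadingZeros* use llvm.ctlz
  // directly. The 64-bit variants truncate the result to i32.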
11269   if (BuiltinID == AArch64::BI_CountLeadingOnes ||
11270       BuiltinID == AArch64::BI_CountLeadingOnes64 ||
11271       BuiltinID == AArch64::BI_CountLeadingZeros ||
11272       BuiltinID == AArch64::BI_CountLeadingZeros64) {
11273     Value *Arg = EmitScalarExpr(E->getArg(0));
11274     llvm::Type *ArgType = Arg->getType();
11275 
11276     if (BuiltinID == AArch64::BI_CountLeadingOnes ||
11277         BuiltinID == AArch64::BI_CountLeadingOnes64)
11278       Arg = Builder.CreateXor(Arg, Constant::getAllOnesValue(ArgType));
11279 
11280     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
11281     Value *Result = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
11282 
11283     if (BuiltinID == AArch64::BI_CountLeadingOnes64 ||
11284         BuiltinID == AArch64::BI_CountLeadingZeros64)
11285       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11286     return Result;
11287   }
11288 
11289   if (BuiltinID == AArch64::BI_CountLeadingSigns ||
11290       BuiltinID == AArch64::BI_CountLeadingSigns64) {
11291     Value *Arg = EmitScalarExpr(E->getArg(0));
11292 
11293     Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns)
11294                       ? CGM.getIntrinsic(Intrinsic::aarch64_cls)
11295                       : CGM.getIntrinsic(Intrinsic::aarch64_cls64);
11296 
11297     Value *Result = Builder.CreateCall(F, Arg, "cls");
11298     if (BuiltinID == AArch64::BI_CountLeadingSigns64)
11299       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11300     return Result;
11301   }
11302 
11303   if (BuiltinID == AArch64::BI_CountOneBits ||
11304       BuiltinID == AArch64::BI_CountOneBits64) {
11305     Value *ArgValue = EmitScalarExpr(E->getArg(0));
11306     llvm::Type *ArgType = ArgValue->getType();
11307     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
11308 
11309     Value *Result = Builder.CreateCall(F, ArgValue);
11310     if (BuiltinID == AArch64::BI_CountOneBits64)
11311       Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11312     return Result;
11313   }
11314 
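  // __prefetch lowers to llvm.prefetch with rw = 0 (a read prefetch),
  // locality = 3 (maximum temporal locality) and cache type = 1 (data cache).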
11315   if (BuiltinID == AArch64::BI__prefetch) {
11316     Value *Address = EmitScalarExpr(E->getArg(0));
11317     Value *RW = llvm::ConstantInt::get(Int32Ty, 0);
11318     Value *Locality = ConstantInt::get(Int32Ty, 3);
11319     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
11320     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
11321     return Builder.CreateCall(F, {Address, RW, Locality, Data});
11322   }
11323 
11324   // Handle MSVC intrinsics before argument evaluation to prevent double
11325   // evaluation.
11326   if (std::optional<MSVCIntrin> MsvcIntId =
11327           translateAarch64ToMsvcIntrin(BuiltinID))
11328     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
11329 
11330   // Some intrinsics are equivalent; if so, use the base intrinsic ID.
11331   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
11332     return P.first == BuiltinID;
11333   });
11334   if (It != end(NEONEquivalentIntrinsicMap))
11335     BuiltinID = It->second;
11336 
11337   // Find out if any arguments are required to be integer constant
11338   // expressions.
11339   unsigned ICEArguments = 0;
11340   ASTContext::GetBuiltinTypeError Error;
11341   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
11342   assert(Error == ASTContext::GE_None && "Should not codegen an error");
11343 
11344   llvm::SmallVector<Value*, 4> Ops;
11345   Address PtrOp0 = Address::invalid();
11346   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
11347     if (i == 0) {
11348       switch (BuiltinID) {
11349       case NEON::BI__builtin_neon_vld1_v:
11350       case NEON::BI__builtin_neon_vld1q_v:
11351       case NEON::BI__builtin_neon_vld1_dup_v:
11352       case NEON::BI__builtin_neon_vld1q_dup_v:
11353       case NEON::BI__builtin_neon_vld1_lane_v:
11354       case NEON::BI__builtin_neon_vld1q_lane_v:
11355       case NEON::BI__builtin_neon_vst1_v:
11356       case NEON::BI__builtin_neon_vst1q_v:
11357       case NEON::BI__builtin_neon_vst1_lane_v:
11358       case NEON::BI__builtin_neon_vst1q_lane_v:
11359       case NEON::BI__builtin_neon_vldap1_lane_s64:
11360       case NEON::BI__builtin_neon_vldap1q_lane_s64:
11361       case NEON::BI__builtin_neon_vstl1_lane_s64:
11362       case NEON::BI__builtin_neon_vstl1q_lane_s64:
11363         // Get the alignment for the argument in addition to the value;
11364         // we'll use it later.
11365         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
11366         Ops.push_back(PtrOp0.getPointer());
11367         continue;
11368       }
11369     }
11370     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
11371   }
11372 
11373   auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
11374   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
11375       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
11376 
11377   if (Builtin) {
11378     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
11379     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
11380     assert(Result && "SISD intrinsic should have been handled");
11381     return Result;
11382   }
11383 
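  // For overloaded NEON builtins the trailing argument is an integer constant
  // encoding NeonTypeFlags (element type, signedness, quad-ness); decode it to
  // select the concrete overload below.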
11384   const Expr *Arg = E->getArg(E->getNumArgs()-1);
11385   NeonTypeFlags Type(0);
11386   if (std::optional<llvm::APSInt> Result =
11387           Arg->getIntegerConstantExpr(getContext()))
11388     // Determine the type of this overloaded NEON intrinsic.
11389     Type = NeonTypeFlags(Result->getZExtValue());
11390 
11391   bool usgn = Type.isUnsigned();
11392   bool quad = Type.isQuad();
11393 
11394   // Handle non-overloaded intrinsics first.
11395   switch (BuiltinID) {
11396   default: break;
11397   case NEON::BI__builtin_neon_vabsh_f16:
11398     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11399     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
11400   case NEON::BI__builtin_neon_vaddq_p128: {
11401     llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
11402     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11403     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11404     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11405     Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
11406     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
11407     return Builder.CreateBitCast(Ops[0], Int128Ty);
11408   }
11409   case NEON::BI__builtin_neon_vldrq_p128: {
11410     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
11411     Value *Ptr = EmitScalarExpr(E->getArg(0));
11412     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
11413                                      CharUnits::fromQuantity(16));
11414   }
11415   case NEON::BI__builtin_neon_vstrq_p128: {
11416     Value *Ptr = Ops[0];
11417     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
11418   }
11419   case NEON::BI__builtin_neon_vcvts_f32_u32:
11420   case NEON::BI__builtin_neon_vcvtd_f64_u64:
11421     usgn = true;
11422     [[fallthrough]];
11423   case NEON::BI__builtin_neon_vcvts_f32_s32:
11424   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
11425     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11426     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
11427     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
11428     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
11429     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
11430     if (usgn)
11431       return Builder.CreateUIToFP(Ops[0], FTy);
11432     return Builder.CreateSIToFP(Ops[0], FTy);
11433   }
11434   case NEON::BI__builtin_neon_vcvth_f16_u16:
11435   case NEON::BI__builtin_neon_vcvth_f16_u32:
11436   case NEON::BI__builtin_neon_vcvth_f16_u64:
11437     usgn = true;
11438     [[fallthrough]];
11439   case NEON::BI__builtin_neon_vcvth_f16_s16:
11440   case NEON::BI__builtin_neon_vcvth_f16_s32:
11441   case NEON::BI__builtin_neon_vcvth_f16_s64: {
11442     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11443     llvm::Type *FTy = HalfTy;
11444     llvm::Type *InTy;
11445     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
11446       InTy = Int64Ty;
11447     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
11448       InTy = Int32Ty;
11449     else
11450       InTy = Int16Ty;
11451     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
11452     if (usgn)
11453       return Builder.CreateUIToFP(Ops[0], FTy);
11454     return Builder.CreateSIToFP(Ops[0], FTy);
11455   }
11456   case NEON::BI__builtin_neon_vcvtah_u16_f16:
11457   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
11458   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
11459   case NEON::BI__builtin_neon_vcvtph_u16_f16:
11460   case NEON::BI__builtin_neon_vcvth_u16_f16:
11461   case NEON::BI__builtin_neon_vcvtah_s16_f16:
11462   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
11463   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
11464   case NEON::BI__builtin_neon_vcvtph_s16_f16:
11465   case NEON::BI__builtin_neon_vcvth_s16_f16: {
11466     unsigned Int;
11467     llvm::Type* InTy = Int32Ty;
11468     llvm::Type* FTy  = HalfTy;
11469     llvm::Type *Tys[2] = {InTy, FTy};
11470     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11471     switch (BuiltinID) {
11472     default: llvm_unreachable("missing builtin ID in switch!");
11473     case NEON::BI__builtin_neon_vcvtah_u16_f16:
11474       Int = Intrinsic::aarch64_neon_fcvtau; break;
11475     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
11476       Int = Intrinsic::aarch64_neon_fcvtmu; break;
11477     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
11478       Int = Intrinsic::aarch64_neon_fcvtnu; break;
11479     case NEON::BI__builtin_neon_vcvtph_u16_f16:
11480       Int = Intrinsic::aarch64_neon_fcvtpu; break;
11481     case NEON::BI__builtin_neon_vcvth_u16_f16:
11482       Int = Intrinsic::aarch64_neon_fcvtzu; break;
11483     case NEON::BI__builtin_neon_vcvtah_s16_f16:
11484       Int = Intrinsic::aarch64_neon_fcvtas; break;
11485     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
11486       Int = Intrinsic::aarch64_neon_fcvtms; break;
11487     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
11488       Int = Intrinsic::aarch64_neon_fcvtns; break;
11489     case NEON::BI__builtin_neon_vcvtph_s16_f16:
11490       Int = Intrinsic::aarch64_neon_fcvtps; break;
11491     case NEON::BI__builtin_neon_vcvth_s16_f16:
11492       Int = Intrinsic::aarch64_neon_fcvtzs; break;
11493     }
11494     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
11495     return Builder.CreateTrunc(Ops[0], Int16Ty);
11496   }
11497   case NEON::BI__builtin_neon_vcaleh_f16:
11498   case NEON::BI__builtin_neon_vcalth_f16:
11499   case NEON::BI__builtin_neon_vcageh_f16:
11500   case NEON::BI__builtin_neon_vcagth_f16: {
11501     unsigned Int;
11502     llvm::Type* InTy = Int32Ty;
11503     llvm::Type* FTy  = HalfTy;
11504     llvm::Type *Tys[2] = {InTy, FTy};
11505     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11506     switch (BuiltinID) {
11507     default: llvm_unreachable("missing builtin ID in switch!");
11508     case NEON::BI__builtin_neon_vcageh_f16:
11509       Int = Intrinsic::aarch64_neon_facge; break;
11510     case NEON::BI__builtin_neon_vcagth_f16:
11511       Int = Intrinsic::aarch64_neon_facgt; break;
11512     case NEON::BI__builtin_neon_vcaleh_f16:
11513       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
11514     case NEON::BI__builtin_neon_vcalth_f16:
11515       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
11516     }
11517     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
11518     return Builder.CreateTrunc(Ops[0], Int16Ty);
11519   }
11520   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
11521   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
11522     unsigned Int;
11523     llvm::Type* InTy = Int32Ty;
11524     llvm::Type* FTy  = HalfTy;
11525     llvm::Type *Tys[2] = {InTy, FTy};
11526     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11527     switch (BuiltinID) {
11528     default: llvm_unreachable("missing builtin ID in switch!");
11529     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
11530       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
11531     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
11532       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
11533     }
11534     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
11535     return Builder.CreateTrunc(Ops[0], Int16Ty);
11536   }
11537   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
11538   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
11539     unsigned Int;
11540     llvm::Type* FTy  = HalfTy;
11541     llvm::Type* InTy = Int32Ty;
11542     llvm::Type *Tys[2] = {FTy, InTy};
11543     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11544     switch (BuiltinID) {
11545     default: llvm_unreachable("missing builtin ID in switch!");
11546     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
11547       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
11548       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
11549       break;
11550     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
11551       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
11552       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
11553       break;
11554     }
11555     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
11556   }
11557   case NEON::BI__builtin_neon_vpaddd_s64: {
11558     auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
11559     Value *Vec = EmitScalarExpr(E->getArg(0));
11560     // The vector is v2i64, so make sure it's bitcast to that.
11561     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
11562     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11563     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11564     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11565     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11566     // Pairwise addition of a v2i64 into a scalar i64.
11567     return Builder.CreateAdd(Op0, Op1, "vpaddd");
11568   }
11569   case NEON::BI__builtin_neon_vpaddd_f64: {
11570     auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
11571     Value *Vec = EmitScalarExpr(E->getArg(0));
11572     // The vector is v2f64, so make sure it's bitcast to that.
11573     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
11574     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11575     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11576     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11577     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11578     // Pairwise addition of a v2f64 into a scalar f64.
11579     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
11580   }
11581   case NEON::BI__builtin_neon_vpadds_f32: {
11582     auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
11583     Value *Vec = EmitScalarExpr(E->getArg(0));
11584     // The vector is v2f32, so make sure it's bitcast to that.
11585     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
11586     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11587     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11588     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11589     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11590     // Pairwise addition of a v2f32 into a scalar f32.
11591     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
11592   }
11593   case NEON::BI__builtin_neon_vceqzd_s64:
11594   case NEON::BI__builtin_neon_vceqzd_f64:
11595   case NEON::BI__builtin_neon_vceqzs_f32:
11596   case NEON::BI__builtin_neon_vceqzh_f16:
11597     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11598     return EmitAArch64CompareBuiltinExpr(
11599         Ops[0], ConvertType(E->getCallReturnType(getContext())),
11600         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
11601   case NEON::BI__builtin_neon_vcgezd_s64:
11602   case NEON::BI__builtin_neon_vcgezd_f64:
11603   case NEON::BI__builtin_neon_vcgezs_f32:
11604   case NEON::BI__builtin_neon_vcgezh_f16:
11605     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11606     return EmitAArch64CompareBuiltinExpr(
11607         Ops[0], ConvertType(E->getCallReturnType(getContext())),
11608         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
11609   case NEON::BI__builtin_neon_vclezd_s64:
11610   case NEON::BI__builtin_neon_vclezd_f64:
11611   case NEON::BI__builtin_neon_vclezs_f32:
11612   case NEON::BI__builtin_neon_vclezh_f16:
11613     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11614     return EmitAArch64CompareBuiltinExpr(
11615         Ops[0], ConvertType(E->getCallReturnType(getContext())),
11616         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
11617   case NEON::BI__builtin_neon_vcgtzd_s64:
11618   case NEON::BI__builtin_neon_vcgtzd_f64:
11619   case NEON::BI__builtin_neon_vcgtzs_f32:
11620   case NEON::BI__builtin_neon_vcgtzh_f16:
11621     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11622     return EmitAArch64CompareBuiltinExpr(
11623         Ops[0], ConvertType(E->getCallReturnType(getContext())),
11624         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
11625   case NEON::BI__builtin_neon_vcltzd_s64:
11626   case NEON::BI__builtin_neon_vcltzd_f64:
11627   case NEON::BI__builtin_neon_vcltzs_f32:
11628   case NEON::BI__builtin_neon_vcltzh_f16:
11629     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11630     return EmitAArch64CompareBuiltinExpr(
11631         Ops[0], ConvertType(E->getCallReturnType(getContext())),
11632         ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
11633 
11634   case NEON::BI__builtin_neon_vceqzd_u64: {
11635     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11636     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11637     Ops[0] =
11638         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
11639     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
11640   }
11641   case NEON::BI__builtin_neon_vceqd_f64:
11642   case NEON::BI__builtin_neon_vcled_f64:
11643   case NEON::BI__builtin_neon_vcltd_f64:
11644   case NEON::BI__builtin_neon_vcged_f64:
11645   case NEON::BI__builtin_neon_vcgtd_f64: {
11646     llvm::CmpInst::Predicate P;
11647     switch (BuiltinID) {
11648     default: llvm_unreachable("missing builtin ID in switch!");
11649     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
11650     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
11651     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
11652     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
11653     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
11654     }
11655     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11656     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11657     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
11658     if (P == llvm::FCmpInst::FCMP_OEQ)
11659       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11660     else
11661       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11662     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
11663   }
11664   case NEON::BI__builtin_neon_vceqs_f32:
11665   case NEON::BI__builtin_neon_vcles_f32:
11666   case NEON::BI__builtin_neon_vclts_f32:
11667   case NEON::BI__builtin_neon_vcges_f32:
11668   case NEON::BI__builtin_neon_vcgts_f32: {
11669     llvm::CmpInst::Predicate P;
11670     switch (BuiltinID) {
11671     default: llvm_unreachable("missing builtin ID in switch!");
11672     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
11673     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
11674     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
11675     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
11676     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
11677     }
11678     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11679     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
11680     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
11681     if (P == llvm::FCmpInst::FCMP_OEQ)
11682       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11683     else
11684       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11685     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
11686   }
11687   case NEON::BI__builtin_neon_vceqh_f16:
11688   case NEON::BI__builtin_neon_vcleh_f16:
11689   case NEON::BI__builtin_neon_vclth_f16:
11690   case NEON::BI__builtin_neon_vcgeh_f16:
11691   case NEON::BI__builtin_neon_vcgth_f16: {
11692     llvm::CmpInst::Predicate P;
11693     switch (BuiltinID) {
11694     default: llvm_unreachable("missing builtin ID in switch!");
11695     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
11696     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
11697     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
11698     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
11699     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
11700     }
11701     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11702     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
11703     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
11704     if (P == llvm::FCmpInst::FCMP_OEQ)
11705       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11706     else
11707       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11708     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
11709   }
11710   case NEON::BI__builtin_neon_vceqd_s64:
11711   case NEON::BI__builtin_neon_vceqd_u64:
11712   case NEON::BI__builtin_neon_vcgtd_s64:
11713   case NEON::BI__builtin_neon_vcgtd_u64:
11714   case NEON::BI__builtin_neon_vcltd_s64:
11715   case NEON::BI__builtin_neon_vcltd_u64:
11716   case NEON::BI__builtin_neon_vcged_u64:
11717   case NEON::BI__builtin_neon_vcged_s64:
11718   case NEON::BI__builtin_neon_vcled_u64:
11719   case NEON::BI__builtin_neon_vcled_s64: {
11720     llvm::CmpInst::Predicate P;
11721     switch (BuiltinID) {
11722     default: llvm_unreachable("missing builtin ID in switch!");
11723     case NEON::BI__builtin_neon_vceqd_s64:
11724     case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
11725     case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
11726     case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
11727     case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
11728     case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
11729     case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
11730     case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
11731     case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
11732     case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
11733     }
11734     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11735     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11736     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11737     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
11738     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
11739   }
11740   case NEON::BI__builtin_neon_vtstd_s64:
11741   case NEON::BI__builtin_neon_vtstd_u64: {
11742     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11743     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11744     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11745     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
11746     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
11747                                 llvm::Constant::getNullValue(Int64Ty));
11748     return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
11749   }
11750   case NEON::BI__builtin_neon_vset_lane_i8:
11751   case NEON::BI__builtin_neon_vset_lane_i16:
11752   case NEON::BI__builtin_neon_vset_lane_i32:
11753   case NEON::BI__builtin_neon_vset_lane_i64:
11754   case NEON::BI__builtin_neon_vset_lane_bf16:
11755   case NEON::BI__builtin_neon_vset_lane_f32:
11756   case NEON::BI__builtin_neon_vsetq_lane_i8:
11757   case NEON::BI__builtin_neon_vsetq_lane_i16:
11758   case NEON::BI__builtin_neon_vsetq_lane_i32:
11759   case NEON::BI__builtin_neon_vsetq_lane_i64:
11760   case NEON::BI__builtin_neon_vsetq_lane_bf16:
11761   case NEON::BI__builtin_neon_vsetq_lane_f32:
11762     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11763     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11764   case NEON::BI__builtin_neon_vset_lane_f64:
11765     // The vector type needs a cast for the v1f64 variant.
11766     Ops[1] =
11767         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
11768     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11769     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11770   case NEON::BI__builtin_neon_vsetq_lane_f64:
11771     // The vector type needs a cast for the v2f64 variant.
11772     Ops[1] =
11773         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
11774     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11775     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11776 
11777   case NEON::BI__builtin_neon_vget_lane_i8:
11778   case NEON::BI__builtin_neon_vdupb_lane_i8:
11779     Ops[0] =
11780         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
11781     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11782                                         "vget_lane");
11783   case NEON::BI__builtin_neon_vgetq_lane_i8:
11784   case NEON::BI__builtin_neon_vdupb_laneq_i8:
11785     Ops[0] =
11786         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
11787     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11788                                         "vgetq_lane");
11789   case NEON::BI__builtin_neon_vget_lane_i16:
11790   case NEON::BI__builtin_neon_vduph_lane_i16:
11791     Ops[0] =
11792         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
11793     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11794                                         "vget_lane");
11795   case NEON::BI__builtin_neon_vgetq_lane_i16:
11796   case NEON::BI__builtin_neon_vduph_laneq_i16:
11797     Ops[0] =
11798         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
11799     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11800                                         "vgetq_lane");
11801   case NEON::BI__builtin_neon_vget_lane_i32:
11802   case NEON::BI__builtin_neon_vdups_lane_i32:
11803     Ops[0] =
11804         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
11805     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11806                                         "vget_lane");
11807   case NEON::BI__builtin_neon_vdups_lane_f32:
11808     Ops[0] =
11809         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11810     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11811                                         "vdups_lane");
11812   case NEON::BI__builtin_neon_vgetq_lane_i32:
11813   case NEON::BI__builtin_neon_vdups_laneq_i32:
11814     Ops[0] =
11815         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
11816     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11817                                         "vgetq_lane");
11818   case NEON::BI__builtin_neon_vget_lane_i64:
11819   case NEON::BI__builtin_neon_vdupd_lane_i64:
11820     Ops[0] =
11821         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
11822     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11823                                         "vget_lane");
11824   case NEON::BI__builtin_neon_vdupd_lane_f64:
11825     Ops[0] =
11826         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11827     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11828                                         "vdupd_lane");
11829   case NEON::BI__builtin_neon_vgetq_lane_i64:
11830   case NEON::BI__builtin_neon_vdupd_laneq_i64:
11831     Ops[0] =
11832         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
11833     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11834                                         "vgetq_lane");
11835   case NEON::BI__builtin_neon_vget_lane_f32:
11836     Ops[0] =
11837         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11838     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11839                                         "vget_lane");
11840   case NEON::BI__builtin_neon_vget_lane_f64:
11841     Ops[0] =
11842         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11843     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11844                                         "vget_lane");
11845   case NEON::BI__builtin_neon_vgetq_lane_f32:
11846   case NEON::BI__builtin_neon_vdups_laneq_f32:
11847     Ops[0] =
11848         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
11849     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11850                                         "vgetq_lane");
11851   case NEON::BI__builtin_neon_vgetq_lane_f64:
11852   case NEON::BI__builtin_neon_vdupd_laneq_f64:
11853     Ops[0] =
11854         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
11855     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11856                                         "vgetq_lane");
11857   case NEON::BI__builtin_neon_vaddh_f16:
11858     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11859     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
11860   case NEON::BI__builtin_neon_vsubh_f16:
11861     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11862     return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
11863   case NEON::BI__builtin_neon_vmulh_f16:
11864     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11865     return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
11866   case NEON::BI__builtin_neon_vdivh_f16:
11867     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11868     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
11869   case NEON::BI__builtin_neon_vfmah_f16:
11870     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11871     return emitCallMaybeConstrainedFPBuiltin(
11872         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11873         {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
11874   case NEON::BI__builtin_neon_vfmsh_f16: {
11875     Value *Neg = Builder.CreateFNeg(EmitScalarExpr(E->getArg(1)), "vsubh");
11876 
11877     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11878     return emitCallMaybeConstrainedFPBuiltin(
11879         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11880         {Neg, EmitScalarExpr(E->getArg(2)), Ops[0]});
11881   }
11882   case NEON::BI__builtin_neon_vaddd_s64:
11883   case NEON::BI__builtin_neon_vaddd_u64:
11884     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
11885   case NEON::BI__builtin_neon_vsubd_s64:
11886   case NEON::BI__builtin_neon_vsubd_u64:
11887     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
11888   case NEON::BI__builtin_neon_vqdmlalh_s16:
11889   case NEON::BI__builtin_neon_vqdmlslh_s16: {
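          // Widen the i16 scalar operands to <4 x i16> vectors so the vector
          // sqdmull intrinsic can be used, then take lane 0 of the <4 x i32>
          // product before the scalar saturating accumulate.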
11890     SmallVector<Value *, 2> ProductOps;
11891     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11892     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
11893     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11894     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11895                           ProductOps, "vqdmlXl");
11896     Constant *CI = ConstantInt::get(SizeTy, 0);
11897     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11898 
11899     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
11900                                         ? Intrinsic::aarch64_neon_sqadd
11901                                         : Intrinsic::aarch64_neon_sqsub;
11902     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
11903   }
11904   case NEON::BI__builtin_neon_vqshlud_n_s64: {
11905     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11906     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11907     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
11908                         Ops, "vqshlu_n");
11909   }
11910   case NEON::BI__builtin_neon_vqshld_n_u64:
11911   case NEON::BI__builtin_neon_vqshld_n_s64: {
11912     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
11913                                    ? Intrinsic::aarch64_neon_uqshl
11914                                    : Intrinsic::aarch64_neon_sqshl;
11915     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11916     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11917     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
11918   }
11919   case NEON::BI__builtin_neon_vrshrd_n_u64:
11920   case NEON::BI__builtin_neon_vrshrd_n_s64: {
11921     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
11922                                    ? Intrinsic::aarch64_neon_urshl
11923                                    : Intrinsic::aarch64_neon_srshl;
11924     Ops.push_back(EmitScalarExpr(E->getArg(1)));
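          // The rounding shift right by a constant N is emitted as a rounding
          // shift left ([us]rshl) by -N.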
11925     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
11926     Ops[1] = ConstantInt::get(Int64Ty, -SV);
11927     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
11928   }
11929   case NEON::BI__builtin_neon_vrsrad_n_u64:
11930   case NEON::BI__builtin_neon_vrsrad_n_s64: {
11931     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
11932                                    ? Intrinsic::aarch64_neon_urshl
11933                                    : Intrinsic::aarch64_neon_srshl;
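          // As for vrshrd_n, emit the rounding shift right as [us]rshl by the
          // negated amount, then accumulate the result into Ops[0].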
11934     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11935     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
11936     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
11937                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
11938     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
11939   }
11940   case NEON::BI__builtin_neon_vshld_n_s64:
11941   case NEON::BI__builtin_neon_vshld_n_u64: {
11942     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11943     return Builder.CreateShl(
11944         Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
11945   }
11946   case NEON::BI__builtin_neon_vshrd_n_s64: {
11947     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
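          // A 64-bit ashr by 64 yields a poison value in LLVM IR; clamping the
          // amount to 63 gives the intended result (all copies of the sign bit).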
11948     return Builder.CreateAShr(
11949         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11950                                                    Amt->getZExtValue())),
11951         "shrd_n");
11952   }
11953   case NEON::BI__builtin_neon_vshrd_n_u64: {
11954     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11955     uint64_t ShiftAmt = Amt->getZExtValue();
11956     // Right-shifting an unsigned value by its size yields 0.
11957     if (ShiftAmt == 64)
11958       return ConstantInt::get(Int64Ty, 0);
11959     return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
11960                               "shrd_n");
11961   }
11962   case NEON::BI__builtin_neon_vsrad_n_s64: {
11963     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
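          // Clamp the shift amount to 63 for the same reason as vshrd_n_s64.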
11964     Ops[1] = Builder.CreateAShr(
11965         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11966                                                    Amt->getZExtValue())),
11967         "shrd_n");
11968     return Builder.CreateAdd(Ops[0], Ops[1]);
11969   }
11970   case NEON::BI__builtin_neon_vsrad_n_u64: {
11971     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
11972     uint64_t ShiftAmt = Amt->getZExtValue();
11973     // Right-shifting an unsigned value by its size yields 0.
11974     // As Op + 0 = Op, return Ops[0] directly.
11975     if (ShiftAmt == 64)
11976       return Ops[0];
11977     Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
11978                                 "shrd_n");
11979     return Builder.CreateAdd(Ops[0], Ops[1]);
11980   }
11981   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
11982   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
11983   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
11984   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
11985     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
11986                                           "lane");
11987     SmallVector<Value *, 2> ProductOps;
11988     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11989     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
11990     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11991     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11992                           ProductOps, "vqdmlXl");
11993     Constant *CI = ConstantInt::get(SizeTy, 0);
11994     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11995     Ops.pop_back();
11996 
11997     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
11998                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
11999                           ? Intrinsic::aarch64_neon_sqadd
12000                           : Intrinsic::aarch64_neon_sqsub;
12001     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
12002   }
12003   case NEON::BI__builtin_neon_vqdmlals_s32:
12004   case NEON::BI__builtin_neon_vqdmlsls_s32: {
12005     SmallVector<Value *, 2> ProductOps;
12006     ProductOps.push_back(Ops[1]);
12007     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
12008     Ops[1] =
12009         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
12010                      ProductOps, "vqdmlXl");
12011 
12012     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
12013                                         ? Intrinsic::aarch64_neon_sqadd
12014                                         : Intrinsic::aarch64_neon_sqsub;
12015     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
12016   }
12017   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
12018   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
12019   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
12020   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
12021     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
12022                                           "lane");
12023     SmallVector<Value *, 2> ProductOps;
12024     ProductOps.push_back(Ops[1]);
12025     ProductOps.push_back(Ops[2]);
12026     Ops[1] =
12027         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
12028                      ProductOps, "vqdmlXl");
12029     Ops.pop_back();
12030 
12031     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
12032                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
12033                           ? Intrinsic::aarch64_neon_sqadd
12034                           : Intrinsic::aarch64_neon_sqsub;
12035     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
12036   }
12037   case NEON::BI__builtin_neon_vget_lane_bf16:
12038   case NEON::BI__builtin_neon_vduph_lane_bf16:
12039   case NEON::BI__builtin_neon_vduph_lane_f16: {
12040     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
12041                                         "vget_lane");
12042   }
12043   case NEON::BI__builtin_neon_vgetq_lane_bf16:
12044   case NEON::BI__builtin_neon_vduph_laneq_bf16:
12045   case NEON::BI__builtin_neon_vduph_laneq_f16: {
12046     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
12047                                         "vgetq_lane");
12048   }
12049 
12050   case clang::AArch64::BI_InterlockedAdd: {
12051     Address DestAddr = CheckAtomicAlignment(*this, E);
12052     Value *Val = EmitScalarExpr(E->getArg(1));
12053     AtomicRMWInst *RMWI =
12054         Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val,
12055                                 llvm::AtomicOrdering::SequentiallyConsistent);
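          // atomicrmw returns the value that was in memory before the operation,
          // but _InterlockedAdd returns the new value, so add Val to the result.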
12056     return Builder.CreateAdd(RMWI, Val);
12057   }
12058   }
12059 
12060   llvm::FixedVectorType *VTy = GetNeonType(this, Type);
12061   llvm::Type *Ty = VTy;
12062   if (!Ty)
12063     return nullptr;
12064 
12065   // Not all intrinsics handled by the common case work for AArch64 yet, so only
12066   // defer to common code if it's been added to our special map.
12067   Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
12068                                         AArch64SIMDIntrinsicsProvenSorted);
12069 
12070   if (Builtin)
12071     return EmitCommonNeonBuiltinExpr(
12072         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
12073         Builtin->NameHint, Builtin->TypeModifier, E, Ops,
12074         /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
12075 
12076   if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
12077     return V;
12078 
12079   unsigned Int;
12080   switch (BuiltinID) {
12081   default: return nullptr;
12082   case NEON::BI__builtin_neon_vbsl_v:
12083   case NEON::BI__builtin_neon_vbslq_v: {
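          // Expand the bitwise select as (mask & a) | (~mask & b) on the integer
          // form of the vectors.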
12084     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
12085     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
12086     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
12087     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
12088 
12089     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
12090     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
12091     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
12092     return Builder.CreateBitCast(Ops[0], Ty);
12093   }
12094   case NEON::BI__builtin_neon_vfma_lane_v:
12095   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
12096     // The ARM builtins (and instructions) have the addend as the first
12097     // operand, but the 'fma' intrinsics have it last. Swap it around here.
12098     Value *Addend = Ops[0];
12099     Value *Multiplicand = Ops[1];
12100     Value *LaneSource = Ops[2];
12101     Ops[0] = Multiplicand;
12102     Ops[1] = LaneSource;
12103     Ops[2] = Addend;
12104 
12105     // Now adjust things to handle the lane access.
12106     auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
12107                          ? llvm::FixedVectorType::get(VTy->getElementType(),
12108                                                       VTy->getNumElements() / 2)
12109                          : VTy;
12110     llvm::Constant *Cst = cast<Constant>(Ops[3]);
12111     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), Cst);
12112     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
12113     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
12114 
12115     Ops.pop_back();
12116     Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
12117                                        : Intrinsic::fma;
12118     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
12119   }
12120   case NEON::BI__builtin_neon_vfma_laneq_v: {
12121     auto *VTy = cast<llvm::FixedVectorType>(Ty);
12122     // v1f64 fma should be mapped to Neon scalar f64 fma
12123     if (VTy && VTy->getElementType() == DoubleTy) {
12124       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12125       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
12126       llvm::FixedVectorType *VTy =
12127           GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
12128       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
12129       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
12130       Value *Result;
12131       Result = emitCallMaybeConstrainedFPBuiltin(
12132           *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
12133           DoubleTy, {Ops[1], Ops[2], Ops[0]});
12134       return Builder.CreateBitCast(Result, Ty);
12135     }
12136     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12137     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12138 
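          // The laneq form takes its lane from a 128-bit vector, so bitcast the
          // lane source to a vector with twice the elements before splatting the
          // selected lane.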
12139     auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
12140                                            VTy->getNumElements() * 2);
12141     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
12142     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
12143                                                cast<ConstantInt>(Ops[3]));
12144     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
12145 
12146     return emitCallMaybeConstrainedFPBuiltin(
12147         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12148         {Ops[2], Ops[1], Ops[0]});
12149   }
12150   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
12151     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12152     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12153 
12154     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12155     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
12156     return emitCallMaybeConstrainedFPBuiltin(
12157         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12158         {Ops[2], Ops[1], Ops[0]});
12159   }
12160   case NEON::BI__builtin_neon_vfmah_lane_f16:
12161   case NEON::BI__builtin_neon_vfmas_lane_f32:
12162   case NEON::BI__builtin_neon_vfmah_laneq_f16:
12163   case NEON::BI__builtin_neon_vfmas_laneq_f32:
12164   case NEON::BI__builtin_neon_vfmad_lane_f64:
12165   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
12166     Ops.push_back(EmitScalarExpr(E->getArg(3)));
12167     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
12168     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
12169     return emitCallMaybeConstrainedFPBuiltin(
12170         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12171         {Ops[1], Ops[2], Ops[0]});
12172   }
12173   case NEON::BI__builtin_neon_vmull_v:
12174     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12175     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
12176     if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
12177     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
12178   case NEON::BI__builtin_neon_vmax_v:
12179   case NEON::BI__builtin_neon_vmaxq_v:
12180     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12181     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
12182     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
12183     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
12184   case NEON::BI__builtin_neon_vmaxh_f16: {
12185     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12186     Int = Intrinsic::aarch64_neon_fmax;
12187     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
12188   }
12189   case NEON::BI__builtin_neon_vmin_v:
12190   case NEON::BI__builtin_neon_vminq_v:
12191     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12192     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
12193     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
12194     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
12195   case NEON::BI__builtin_neon_vminh_f16: {
12196     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12197     Int = Intrinsic::aarch64_neon_fmin;
12198     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
12199   }
12200   case NEON::BI__builtin_neon_vabd_v:
12201   case NEON::BI__builtin_neon_vabdq_v:
12202     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12203     Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
12204     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
12205     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
12206   case NEON::BI__builtin_neon_vpadal_v:
12207   case NEON::BI__builtin_neon_vpadalq_v: {
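          // vpadal is emitted as a pairwise widening add ([us]addlp) of the
          // second operand followed by an ordinary add of the accumulator.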
12208     unsigned ArgElts = VTy->getNumElements();
12209     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
12210     unsigned BitWidth = EltTy->getBitWidth();
12211     auto *ArgTy = llvm::FixedVectorType::get(
12212         llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
12213     llvm::Type *Tys[2] = { VTy, ArgTy };
12214     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
12215     SmallVector<llvm::Value*, 1> TmpOps;
12216     TmpOps.push_back(Ops[1]);
12217     Function *F = CGM.getIntrinsic(Int, Tys);
12218     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
12219     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
12220     return Builder.CreateAdd(tmp, addend);
12221   }
12222   case NEON::BI__builtin_neon_vpmin_v:
12223   case NEON::BI__builtin_neon_vpminq_v:
12224     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12225     Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
12226     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
12227     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
12228   case NEON::BI__builtin_neon_vpmax_v:
12229   case NEON::BI__builtin_neon_vpmaxq_v:
12230     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12231     Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
12232     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
12233     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
12234   case NEON::BI__builtin_neon_vminnm_v:
12235   case NEON::BI__builtin_neon_vminnmq_v:
12236     Int = Intrinsic::aarch64_neon_fminnm;
12237     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
12238   case NEON::BI__builtin_neon_vminnmh_f16:
12239     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12240     Int = Intrinsic::aarch64_neon_fminnm;
12241     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
12242   case NEON::BI__builtin_neon_vmaxnm_v:
12243   case NEON::BI__builtin_neon_vmaxnmq_v:
12244     Int = Intrinsic::aarch64_neon_fmaxnm;
12245     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
12246   case NEON::BI__builtin_neon_vmaxnmh_f16:
12247     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12248     Int = Intrinsic::aarch64_neon_fmaxnm;
12249     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
12250   case NEON::BI__builtin_neon_vrecpss_f32: {
12251     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12252     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
12253                         Ops, "vrecps");
12254   }
12255   case NEON::BI__builtin_neon_vrecpsd_f64:
12256     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12257     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
12258                         Ops, "vrecps");
12259   case NEON::BI__builtin_neon_vrecpsh_f16:
12260     Ops.push_back(EmitScalarExpr(E->getArg(1)));
12261     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
12262                         Ops, "vrecps");
12263   case NEON::BI__builtin_neon_vqshrun_n_v:
12264     Int = Intrinsic::aarch64_neon_sqshrun;
12265     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
12266   case NEON::BI__builtin_neon_vqrshrun_n_v:
12267     Int = Intrinsic::aarch64_neon_sqrshrun;
12268     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
12269   case NEON::BI__builtin_neon_vqshrn_n_v:
12270     Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
12271     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
12272   case NEON::BI__builtin_neon_vrshrn_n_v:
12273     Int = Intrinsic::aarch64_neon_rshrn;
12274     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
12275   case NEON::BI__builtin_neon_vqrshrn_n_v:
12276     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
12277     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
12278   case NEON::BI__builtin_neon_vrndah_f16: {
12279     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12280     Int = Builder.getIsFPConstrained()
12281               ? Intrinsic::experimental_constrained_round
12282               : Intrinsic::round;
12283     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
12284   }
12285   case NEON::BI__builtin_neon_vrnda_v:
12286   case NEON::BI__builtin_neon_vrndaq_v: {
12287     Int = Builder.getIsFPConstrained()
12288               ? Intrinsic::experimental_constrained_round
12289               : Intrinsic::round;
12290     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
12291   }
12292   case NEON::BI__builtin_neon_vrndih_f16: {
12293     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12294     Int = Builder.getIsFPConstrained()
12295               ? Intrinsic::experimental_constrained_nearbyint
12296               : Intrinsic::nearbyint;
12297     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
12298   }
12299   case NEON::BI__builtin_neon_vrndmh_f16: {
12300     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12301     Int = Builder.getIsFPConstrained()
12302               ? Intrinsic::experimental_constrained_floor
12303               : Intrinsic::floor;
12304     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
12305   }
12306   case NEON::BI__builtin_neon_vrndm_v:
12307   case NEON::BI__builtin_neon_vrndmq_v: {
12308     Int = Builder.getIsFPConstrained()
12309               ? Intrinsic::experimental_constrained_floor
12310               : Intrinsic::floor;
12311     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
12312   }
12313   case NEON::BI__builtin_neon_vrndnh_f16: {
12314     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12315     Int = Builder.getIsFPConstrained()
12316               ? Intrinsic::experimental_constrained_roundeven
12317               : Intrinsic::roundeven;
12318     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
12319   }
12320   case NEON::BI__builtin_neon_vrndn_v:
12321   case NEON::BI__builtin_neon_vrndnq_v: {
12322     Int = Builder.getIsFPConstrained()
12323               ? Intrinsic::experimental_constrained_roundeven
12324               : Intrinsic::roundeven;
12325     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
12326   }
12327   case NEON::BI__builtin_neon_vrndns_f32: {
12328     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12329     Int = Builder.getIsFPConstrained()
12330               ? Intrinsic::experimental_constrained_roundeven
12331               : Intrinsic::roundeven;
12332     return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
12333   }
12334   case NEON::BI__builtin_neon_vrndph_f16: {
12335     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12336     Int = Builder.getIsFPConstrained()
12337               ? Intrinsic::experimental_constrained_ceil
12338               : Intrinsic::ceil;
12339     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
12340   }
12341   case NEON::BI__builtin_neon_vrndp_v:
12342   case NEON::BI__builtin_neon_vrndpq_v: {
12343     Int = Builder.getIsFPConstrained()
12344               ? Intrinsic::experimental_constrained_ceil
12345               : Intrinsic::ceil;
12346     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
12347   }
12348   case NEON::BI__builtin_neon_vrndxh_f16: {
12349     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12350     Int = Builder.getIsFPConstrained()
12351               ? Intrinsic::experimental_constrained_rint
12352               : Intrinsic::rint;
12353     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
12354   }
12355   case NEON::BI__builtin_neon_vrndx_v:
12356   case NEON::BI__builtin_neon_vrndxq_v: {
12357     Int = Builder.getIsFPConstrained()
12358               ? Intrinsic::experimental_constrained_rint
12359               : Intrinsic::rint;
12360     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
12361   }
12362   case NEON::BI__builtin_neon_vrndh_f16: {
12363     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12364     Int = Builder.getIsFPConstrained()
12365               ? Intrinsic::experimental_constrained_trunc
12366               : Intrinsic::trunc;
12367     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
12368   }
12369   case NEON::BI__builtin_neon_vrnd32x_f32:
12370   case NEON::BI__builtin_neon_vrnd32xq_f32:
12371   case NEON::BI__builtin_neon_vrnd32x_f64:
12372   case NEON::BI__builtin_neon_vrnd32xq_f64: {
12373     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12374     Int = Intrinsic::aarch64_neon_frint32x;
12375     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
12376   }
12377   case NEON::BI__builtin_neon_vrnd32z_f32:
12378   case NEON::BI__builtin_neon_vrnd32zq_f32:
12379   case NEON::BI__builtin_neon_vrnd32z_f64:
12380   case NEON::BI__builtin_neon_vrnd32zq_f64: {
12381     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12382     Int = Intrinsic::aarch64_neon_frint32z;
12383     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
12384   }
12385   case NEON::BI__builtin_neon_vrnd64x_f32:
12386   case NEON::BI__builtin_neon_vrnd64xq_f32:
12387   case NEON::BI__builtin_neon_vrnd64x_f64:
12388   case NEON::BI__builtin_neon_vrnd64xq_f64: {
12389     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12390     Int = Intrinsic::aarch64_neon_frint64x;
12391     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
12392   }
12393   case NEON::BI__builtin_neon_vrnd64z_f32:
12394   case NEON::BI__builtin_neon_vrnd64zq_f32:
12395   case NEON::BI__builtin_neon_vrnd64z_f64:
12396   case NEON::BI__builtin_neon_vrnd64zq_f64: {
12397     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12398     Int = Intrinsic::aarch64_neon_frint64z;
12399     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
12400   }
12401   case NEON::BI__builtin_neon_vrnd_v:
12402   case NEON::BI__builtin_neon_vrndq_v: {
12403     Int = Builder.getIsFPConstrained()
12404               ? Intrinsic::experimental_constrained_trunc
12405               : Intrinsic::trunc;
12406     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
12407   }
12408   case NEON::BI__builtin_neon_vcvt_f64_v:
12409   case NEON::BI__builtin_neon_vcvtq_f64_v:
12410     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12411     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
12412     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
12413                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
12414   case NEON::BI__builtin_neon_vcvt_f64_f32: {
12415     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
12416            "unexpected vcvt_f64_f32 builtin");
12417     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
12418     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
12419 
12420     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
12421   }
12422   case NEON::BI__builtin_neon_vcvt_f32_f64: {
12423     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
12424            "unexpected vcvt_f32_f64 builtin");
12425     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
12426     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
12427 
12428     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
12429   }
12430   case NEON::BI__builtin_neon_vcvt_s32_v:
12431   case NEON::BI__builtin_neon_vcvt_u32_v:
12432   case NEON::BI__builtin_neon_vcvt_s64_v:
12433   case NEON::BI__builtin_neon_vcvt_u64_v:
12434   case NEON::BI__builtin_neon_vcvt_s16_f16:
12435   case NEON::BI__builtin_neon_vcvt_u16_f16:
12436   case NEON::BI__builtin_neon_vcvtq_s32_v:
12437   case NEON::BI__builtin_neon_vcvtq_u32_v:
12438   case NEON::BI__builtin_neon_vcvtq_s64_v:
12439   case NEON::BI__builtin_neon_vcvtq_u64_v:
12440   case NEON::BI__builtin_neon_vcvtq_s16_f16:
12441   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
12442     Int =
12443         usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
12444     llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
12445     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
12446   }
12447   case NEON::BI__builtin_neon_vcvta_s16_f16:
12448   case NEON::BI__builtin_neon_vcvta_u16_f16:
12449   case NEON::BI__builtin_neon_vcvta_s32_v:
12450   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
12451   case NEON::BI__builtin_neon_vcvtaq_s32_v:
12452   case NEON::BI__builtin_neon_vcvta_u32_v:
12453   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
12454   case NEON::BI__builtin_neon_vcvtaq_u32_v:
12455   case NEON::BI__builtin_neon_vcvta_s64_v:
12456   case NEON::BI__builtin_neon_vcvtaq_s64_v:
12457   case NEON::BI__builtin_neon_vcvta_u64_v:
12458   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
12459     Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
12460     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12461     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
12462   }
12463   case NEON::BI__builtin_neon_vcvtm_s16_f16:
12464   case NEON::BI__builtin_neon_vcvtm_s32_v:
12465   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
12466   case NEON::BI__builtin_neon_vcvtmq_s32_v:
12467   case NEON::BI__builtin_neon_vcvtm_u16_f16:
12468   case NEON::BI__builtin_neon_vcvtm_u32_v:
12469   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
12470   case NEON::BI__builtin_neon_vcvtmq_u32_v:
12471   case NEON::BI__builtin_neon_vcvtm_s64_v:
12472   case NEON::BI__builtin_neon_vcvtmq_s64_v:
12473   case NEON::BI__builtin_neon_vcvtm_u64_v:
12474   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
12475     Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
12476     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12477     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
12478   }
12479   case NEON::BI__builtin_neon_vcvtn_s16_f16:
12480   case NEON::BI__builtin_neon_vcvtn_s32_v:
12481   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
12482   case NEON::BI__builtin_neon_vcvtnq_s32_v:
12483   case NEON::BI__builtin_neon_vcvtn_u16_f16:
12484   case NEON::BI__builtin_neon_vcvtn_u32_v:
12485   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
12486   case NEON::BI__builtin_neon_vcvtnq_u32_v:
12487   case NEON::BI__builtin_neon_vcvtn_s64_v:
12488   case NEON::BI__builtin_neon_vcvtnq_s64_v:
12489   case NEON::BI__builtin_neon_vcvtn_u64_v:
12490   case NEON::BI__builtin_neon_vcvtnq_u64_v: {
12491     Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
12492     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12493     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
12494   }
12495   case NEON::BI__builtin_neon_vcvtp_s16_f16:
12496   case NEON::BI__builtin_neon_vcvtp_s32_v:
12497   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
12498   case NEON::BI__builtin_neon_vcvtpq_s32_v:
12499   case NEON::BI__builtin_neon_vcvtp_u16_f16:
12500   case NEON::BI__builtin_neon_vcvtp_u32_v:
12501   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
12502   case NEON::BI__builtin_neon_vcvtpq_u32_v:
12503   case NEON::BI__builtin_neon_vcvtp_s64_v:
12504   case NEON::BI__builtin_neon_vcvtpq_s64_v:
12505   case NEON::BI__builtin_neon_vcvtp_u64_v:
12506   case NEON::BI__builtin_neon_vcvtpq_u64_v: {
12507     Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
12508     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12509     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
12510   }
12511   case NEON::BI__builtin_neon_vmulx_v:
12512   case NEON::BI__builtin_neon_vmulxq_v: {
12513     Int = Intrinsic::aarch64_neon_fmulx;
12514     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
12515   }
12516   case NEON::BI__builtin_neon_vmulxh_lane_f16:
12517   case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
12518     // vmulx_lane should be mapped to Neon scalar mulx after
12519     // extracting the scalar element
12520     Ops.push_back(EmitScalarExpr(E->getArg(2)));
12521     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
12522     Ops.pop_back();
12523     Int = Intrinsic::aarch64_neon_fmulx;
12524     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
12525   }
12526   case NEON::BI__builtin_neon_vmul_lane_v:
12527   case NEON::BI__builtin_neon_vmul_laneq_v: {
12528     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
12529     bool Quad = BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v;
12532     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12533     llvm::FixedVectorType *VTy =
12534         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
12535     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
12536     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
12537     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
12538     return Builder.CreateBitCast(Result, Ty);
12539   }
12540   case NEON::BI__builtin_neon_vnegd_s64:
12541     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
12542   case NEON::BI__builtin_neon_vnegh_f16:
12543     return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
12544   case NEON::BI__builtin_neon_vpmaxnm_v:
12545   case NEON::BI__builtin_neon_vpmaxnmq_v: {
12546     Int = Intrinsic::aarch64_neon_fmaxnmp;
12547     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
12548   }
12549   case NEON::BI__builtin_neon_vpminnm_v:
12550   case NEON::BI__builtin_neon_vpminnmq_v: {
12551     Int = Intrinsic::aarch64_neon_fminnmp;
12552     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
12553   }
12554   case NEON::BI__builtin_neon_vsqrth_f16: {
12555     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12556     Int = Builder.getIsFPConstrained()
12557               ? Intrinsic::experimental_constrained_sqrt
12558               : Intrinsic::sqrt;
12559     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
12560   }
12561   case NEON::BI__builtin_neon_vsqrt_v:
12562   case NEON::BI__builtin_neon_vsqrtq_v: {
12563     Int = Builder.getIsFPConstrained()
12564               ? Intrinsic::experimental_constrained_sqrt
12565               : Intrinsic::sqrt;
12566     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12567     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
12568   }
12569   case NEON::BI__builtin_neon_vrbit_v:
12570   case NEON::BI__builtin_neon_vrbitq_v: {
12571     Int = Intrinsic::bitreverse;
12572     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
12573   }
12574   case NEON::BI__builtin_neon_vaddv_u8:
12575     // FIXME: These are handled by the AArch64 scalar code.
12576     usgn = true;
12577     [[fallthrough]];
12578   case NEON::BI__builtin_neon_vaddv_s8: {
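          // The reduction is emitted with an i32 result type and then truncated
          // back to the 8-bit element type.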
12579     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12580     Ty = Int32Ty;
12581     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12582     llvm::Type *Tys[2] = { Ty, VTy };
12583     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12584     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12585     return Builder.CreateTrunc(Ops[0], Int8Ty);
12586   }
12587   case NEON::BI__builtin_neon_vaddv_u16:
12588     usgn = true;
12589     [[fallthrough]];
12590   case NEON::BI__builtin_neon_vaddv_s16: {
12591     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12592     Ty = Int32Ty;
12593     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12594     llvm::Type *Tys[2] = { Ty, VTy };
12595     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12596     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12597     return Builder.CreateTrunc(Ops[0], Int16Ty);
12598   }
12599   case NEON::BI__builtin_neon_vaddvq_u8:
12600     usgn = true;
12601     [[fallthrough]];
12602   case NEON::BI__builtin_neon_vaddvq_s8: {
12603     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12604     Ty = Int32Ty;
12605     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12606     llvm::Type *Tys[2] = { Ty, VTy };
12607     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12608     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12609     return Builder.CreateTrunc(Ops[0], Int8Ty);
12610   }
12611   case NEON::BI__builtin_neon_vaddvq_u16:
12612     usgn = true;
12613     [[fallthrough]];
12614   case NEON::BI__builtin_neon_vaddvq_s16: {
12615     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12616     Ty = Int32Ty;
12617     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12618     llvm::Type *Tys[2] = { Ty, VTy };
12619     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12620     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12621     return Builder.CreateTrunc(Ops[0], Int16Ty);
12622   }
12623   case NEON::BI__builtin_neon_vmaxv_u8: {
12624     Int = Intrinsic::aarch64_neon_umaxv;
12625     Ty = Int32Ty;
12626     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12627     llvm::Type *Tys[2] = { Ty, VTy };
12628     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12629     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12630     return Builder.CreateTrunc(Ops[0], Int8Ty);
12631   }
12632   case NEON::BI__builtin_neon_vmaxv_u16: {
12633     Int = Intrinsic::aarch64_neon_umaxv;
12634     Ty = Int32Ty;
12635     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12636     llvm::Type *Tys[2] = { Ty, VTy };
12637     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12638     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12639     return Builder.CreateTrunc(Ops[0], Int16Ty);
12640   }
12641   case NEON::BI__builtin_neon_vmaxvq_u8: {
12642     Int = Intrinsic::aarch64_neon_umaxv;
12643     Ty = Int32Ty;
12644     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12645     llvm::Type *Tys[2] = { Ty, VTy };
12646     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12647     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12648     return Builder.CreateTrunc(Ops[0], Int8Ty);
12649   }
12650   case NEON::BI__builtin_neon_vmaxvq_u16: {
12651     Int = Intrinsic::aarch64_neon_umaxv;
12652     Ty = Int32Ty;
12653     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12654     llvm::Type *Tys[2] = { Ty, VTy };
12655     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12656     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12657     return Builder.CreateTrunc(Ops[0], Int16Ty);
12658   }
12659   case NEON::BI__builtin_neon_vmaxv_s8: {
12660     Int = Intrinsic::aarch64_neon_smaxv;
12661     Ty = Int32Ty;
12662     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12663     llvm::Type *Tys[2] = { Ty, VTy };
12664     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12665     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12666     return Builder.CreateTrunc(Ops[0], Int8Ty);
12667   }
12668   case NEON::BI__builtin_neon_vmaxv_s16: {
12669     Int = Intrinsic::aarch64_neon_smaxv;
12670     Ty = Int32Ty;
12671     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12672     llvm::Type *Tys[2] = { Ty, VTy };
12673     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12674     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12675     return Builder.CreateTrunc(Ops[0], Int16Ty);
12676   }
12677   case NEON::BI__builtin_neon_vmaxvq_s8: {
12678     Int = Intrinsic::aarch64_neon_smaxv;
12679     Ty = Int32Ty;
12680     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12681     llvm::Type *Tys[2] = { Ty, VTy };
12682     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12683     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12684     return Builder.CreateTrunc(Ops[0], Int8Ty);
12685   }
12686   case NEON::BI__builtin_neon_vmaxvq_s16: {
12687     Int = Intrinsic::aarch64_neon_smaxv;
12688     Ty = Int32Ty;
12689     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12690     llvm::Type *Tys[2] = { Ty, VTy };
12691     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12692     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12693     return Builder.CreateTrunc(Ops[0], Int16Ty);
12694   }
12695   case NEON::BI__builtin_neon_vmaxv_f16: {
12696     Int = Intrinsic::aarch64_neon_fmaxv;
12697     Ty = HalfTy;
12698     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12699     llvm::Type *Tys[2] = { Ty, VTy };
12700     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12701     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12702     return Builder.CreateTrunc(Ops[0], HalfTy);
12703   }
12704   case NEON::BI__builtin_neon_vmaxvq_f16: {
12705     Int = Intrinsic::aarch64_neon_fmaxv;
12706     Ty = HalfTy;
12707     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12708     llvm::Type *Tys[2] = { Ty, VTy };
12709     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12710     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12711     return Builder.CreateTrunc(Ops[0], HalfTy);
12712   }
12713   case NEON::BI__builtin_neon_vminv_u8: {
12714     Int = Intrinsic::aarch64_neon_uminv;
12715     Ty = Int32Ty;
12716     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12717     llvm::Type *Tys[2] = { Ty, VTy };
12718     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12719     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12720     return Builder.CreateTrunc(Ops[0], Int8Ty);
12721   }
12722   case NEON::BI__builtin_neon_vminv_u16: {
12723     Int = Intrinsic::aarch64_neon_uminv;
12724     Ty = Int32Ty;
12725     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12726     llvm::Type *Tys[2] = { Ty, VTy };
12727     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12728     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12729     return Builder.CreateTrunc(Ops[0], Int16Ty);
12730   }
12731   case NEON::BI__builtin_neon_vminvq_u8: {
12732     Int = Intrinsic::aarch64_neon_uminv;
12733     Ty = Int32Ty;
12734     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12735     llvm::Type *Tys[2] = { Ty, VTy };
12736     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12737     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12738     return Builder.CreateTrunc(Ops[0], Int8Ty);
12739   }
12740   case NEON::BI__builtin_neon_vminvq_u16: {
12741     Int = Intrinsic::aarch64_neon_uminv;
12742     Ty = Int32Ty;
12743     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12744     llvm::Type *Tys[2] = { Ty, VTy };
12745     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12746     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12747     return Builder.CreateTrunc(Ops[0], Int16Ty);
12748   }
12749   case NEON::BI__builtin_neon_vminv_s8: {
12750     Int = Intrinsic::aarch64_neon_sminv;
12751     Ty = Int32Ty;
12752     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12753     llvm::Type *Tys[2] = { Ty, VTy };
12754     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12755     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12756     return Builder.CreateTrunc(Ops[0], Int8Ty);
12757   }
12758   case NEON::BI__builtin_neon_vminv_s16: {
12759     Int = Intrinsic::aarch64_neon_sminv;
12760     Ty = Int32Ty;
12761     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12762     llvm::Type *Tys[2] = { Ty, VTy };
12763     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12764     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12765     return Builder.CreateTrunc(Ops[0], Int16Ty);
12766   }
12767   case NEON::BI__builtin_neon_vminvq_s8: {
12768     Int = Intrinsic::aarch64_neon_sminv;
12769     Ty = Int32Ty;
12770     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12771     llvm::Type *Tys[2] = { Ty, VTy };
12772     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12773     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12774     return Builder.CreateTrunc(Ops[0], Int8Ty);
12775   }
12776   case NEON::BI__builtin_neon_vminvq_s16: {
12777     Int = Intrinsic::aarch64_neon_sminv;
12778     Ty = Int32Ty;
12779     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12780     llvm::Type *Tys[2] = { Ty, VTy };
12781     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12782     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12783     return Builder.CreateTrunc(Ops[0], Int16Ty);
12784   }
12785   case NEON::BI__builtin_neon_vminv_f16: {
12786     Int = Intrinsic::aarch64_neon_fminv;
12787     Ty = HalfTy;
12788     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12789     llvm::Type *Tys[2] = { Ty, VTy };
12790     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12791     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12792     return Builder.CreateTrunc(Ops[0], HalfTy);
12793   }
12794   case NEON::BI__builtin_neon_vminvq_f16: {
12795     Int = Intrinsic::aarch64_neon_fminv;
12796     Ty = HalfTy;
12797     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12798     llvm::Type *Tys[2] = { Ty, VTy };
12799     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12800     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12801     return Builder.CreateTrunc(Ops[0], HalfTy);
12802   }
12803   case NEON::BI__builtin_neon_vmaxnmv_f16: {
12804     Int = Intrinsic::aarch64_neon_fmaxnmv;
12805     Ty = HalfTy;
12806     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12807     llvm::Type *Tys[2] = { Ty, VTy };
12808     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12809     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12810     return Builder.CreateTrunc(Ops[0], HalfTy);
12811   }
12812   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
12813     Int = Intrinsic::aarch64_neon_fmaxnmv;
12814     Ty = HalfTy;
12815     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12816     llvm::Type *Tys[2] = { Ty, VTy };
12817     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12818     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12819     return Builder.CreateTrunc(Ops[0], HalfTy);
12820   }
12821   case NEON::BI__builtin_neon_vminnmv_f16: {
12822     Int = Intrinsic::aarch64_neon_fminnmv;
12823     Ty = HalfTy;
12824     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12825     llvm::Type *Tys[2] = { Ty, VTy };
12826     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12827     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12828     return Builder.CreateTrunc(Ops[0], HalfTy);
12829   }
12830   case NEON::BI__builtin_neon_vminnmvq_f16: {
12831     Int = Intrinsic::aarch64_neon_fminnmv;
12832     Ty = HalfTy;
12833     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12834     llvm::Type *Tys[2] = { Ty, VTy };
12835     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12836     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12837     return Builder.CreateTrunc(Ops[0], HalfTy);
12838   }
12839   case NEON::BI__builtin_neon_vmul_n_f64: {
12840     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12841     Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
12842     return Builder.CreateFMul(Ops[0], RHS);
12843   }
12844   case NEON::BI__builtin_neon_vaddlv_u8: {
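          // The widening reduction is emitted with an i32 result type and then
          // truncated to the 16-bit result type.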
12845     Int = Intrinsic::aarch64_neon_uaddlv;
12846     Ty = Int32Ty;
12847     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12848     llvm::Type *Tys[2] = { Ty, VTy };
12849     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12850     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12851     return Builder.CreateTrunc(Ops[0], Int16Ty);
12852   }
12853   case NEON::BI__builtin_neon_vaddlv_u16: {
12854     Int = Intrinsic::aarch64_neon_uaddlv;
12855     Ty = Int32Ty;
12856     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12857     llvm::Type *Tys[2] = { Ty, VTy };
12858     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12859     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12860   }
12861   case NEON::BI__builtin_neon_vaddlvq_u8: {
12862     Int = Intrinsic::aarch64_neon_uaddlv;
12863     Ty = Int32Ty;
12864     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12865     llvm::Type *Tys[2] = { Ty, VTy };
12866     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12867     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12868     return Builder.CreateTrunc(Ops[0], Int16Ty);
12869   }
12870   case NEON::BI__builtin_neon_vaddlvq_u16: {
12871     Int = Intrinsic::aarch64_neon_uaddlv;
12872     Ty = Int32Ty;
12873     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12874     llvm::Type *Tys[2] = { Ty, VTy };
12875     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12876     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12877   }
12878   case NEON::BI__builtin_neon_vaddlv_s8: {
12879     Int = Intrinsic::aarch64_neon_saddlv;
12880     Ty = Int32Ty;
12881     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12882     llvm::Type *Tys[2] = { Ty, VTy };
12883     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12884     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12885     return Builder.CreateTrunc(Ops[0], Int16Ty);
12886   }
12887   case NEON::BI__builtin_neon_vaddlv_s16: {
12888     Int = Intrinsic::aarch64_neon_saddlv;
12889     Ty = Int32Ty;
12890     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12891     llvm::Type *Tys[2] = { Ty, VTy };
12892     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12893     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12894   }
12895   case NEON::BI__builtin_neon_vaddlvq_s8: {
12896     Int = Intrinsic::aarch64_neon_saddlv;
12897     Ty = Int32Ty;
12898     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12899     llvm::Type *Tys[2] = { Ty, VTy };
12900     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12901     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12902     return Builder.CreateTrunc(Ops[0], Int16Ty);
12903   }
12904   case NEON::BI__builtin_neon_vaddlvq_s16: {
12905     Int = Intrinsic::aarch64_neon_saddlv;
12906     Ty = Int32Ty;
12907     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12908     llvm::Type *Tys[2] = { Ty, VTy };
12909     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12910     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12911   }
12912   case NEON::BI__builtin_neon_vsri_n_v:
12913   case NEON::BI__builtin_neon_vsriq_n_v: {
12914     Int = Intrinsic::aarch64_neon_vsri;
12915     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12916     return EmitNeonCall(Intrin, Ops, "vsri_n");
12917   }
12918   case NEON::BI__builtin_neon_vsli_n_v:
12919   case NEON::BI__builtin_neon_vsliq_n_v: {
12920     Int = Intrinsic::aarch64_neon_vsli;
12921     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12922     return EmitNeonCall(Intrin, Ops, "vsli_n");
12923   }
12924   case NEON::BI__builtin_neon_vsra_n_v:
12925   case NEON::BI__builtin_neon_vsraq_n_v:
12926     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12927     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
12928     return Builder.CreateAdd(Ops[0], Ops[1]);
12929   case NEON::BI__builtin_neon_vrsra_n_v:
12930   case NEON::BI__builtin_neon_vrsraq_n_v: {
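          // Emit the rounding shift right via [us]rshl; EmitNeonCall negates the
          // constant shift amount because the rightshift flag is set. The rounded
          // value is then accumulated into Ops[0].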
12931     Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
12932     SmallVector<llvm::Value*, 2> TmpOps;
12933     TmpOps.push_back(Ops[1]);
12934     TmpOps.push_back(Ops[2]);
12935     Function *F = CGM.getIntrinsic(Int, Ty);
12936     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
12937     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
12938     return Builder.CreateAdd(Ops[0], tmp);
12939   }
12940   case NEON::BI__builtin_neon_vld1_v:
12941   case NEON::BI__builtin_neon_vld1q_v: {
12942     return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
12943   }
12944   case NEON::BI__builtin_neon_vst1_v:
12945   case NEON::BI__builtin_neon_vst1q_v:
12946     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
12947     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12948   case NEON::BI__builtin_neon_vld1_lane_v:
12949   case NEON::BI__builtin_neon_vld1q_lane_v: {
12950     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12951     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12952                                        PtrOp0.getAlignment());
12953     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
12954   }
12955   case NEON::BI__builtin_neon_vldap1_lane_s64:
12956   case NEON::BI__builtin_neon_vldap1q_lane_s64: {
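          // vldap1 is a load-acquire of a single lane: load the element with
          // acquire ordering, then insert it into the vector.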
12957     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12958     llvm::LoadInst *LI = Builder.CreateAlignedLoad(
12959         VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
12960     LI->setAtomic(llvm::AtomicOrdering::Acquire);
12961     Ops[0] = LI;
12962     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
12963   }
12964   case NEON::BI__builtin_neon_vld1_dup_v:
12965   case NEON::BI__builtin_neon_vld1q_dup_v: {
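          // Load the scalar once, insert it into lane 0 of a poison vector, then
          // splat it across all lanes.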
12966     Value *V = PoisonValue::get(Ty);
12967     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12968                                        PtrOp0.getAlignment());
12969     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
12970     Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
12971     return EmitNeonSplat(Ops[0], CI);
12972   }
12973   case NEON::BI__builtin_neon_vst1_lane_v:
12974   case NEON::BI__builtin_neon_vst1q_lane_v:
12975     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12976     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12977     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12978   case NEON::BI__builtin_neon_vstl1_lane_s64:
12979   case NEON::BI__builtin_neon_vstl1q_lane_s64: {
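          // vstl1 is a store-release of a single lane: extract the lane and
          // store it with release ordering.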
12980     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12981     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12982     llvm::StoreInst *SI =
12983         Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12984     SI->setAtomic(llvm::AtomicOrdering::Release);
12985     return SI;
12986   }
12987   case NEON::BI__builtin_neon_vld2_v:
12988   case NEON::BI__builtin_neon_vld2q_v: {
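          // The ld2 intrinsic returns a struct of two vectors; store it through
          // the result pointer passed in Ops[0].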
12989     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
12990     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
12991     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
12992     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12993   }
12994   case NEON::BI__builtin_neon_vld3_v:
12995   case NEON::BI__builtin_neon_vld3q_v: {
12996     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
12997     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
12998     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
12999     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13000   }
13001   case NEON::BI__builtin_neon_vld4_v:
13002   case NEON::BI__builtin_neon_vld4q_v: {
13003     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13004     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
13005     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
13006     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13007   }
13008   case NEON::BI__builtin_neon_vld2_dup_v:
13009   case NEON::BI__builtin_neon_vld2q_dup_v: {
13010     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13011     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
13012     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
13013     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13014   }
13015   case NEON::BI__builtin_neon_vld3_dup_v:
13016   case NEON::BI__builtin_neon_vld3q_dup_v: {
13017     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13018     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
13019     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
13020     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13021   }
13022   case NEON::BI__builtin_neon_vld4_dup_v:
13023   case NEON::BI__builtin_neon_vld4q_dup_v: {
13024     llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13025     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
13026     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
13027     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13028   }
13029   case NEON::BI__builtin_neon_vld2_lane_v:
13030   case NEON::BI__builtin_neon_vld2q_lane_v: {
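    // Rotate the source pointer to the end so the operands match the ld2lane
    // intrinsic signature (v1, v2, lane, ptr); the loaded pair is then stored
    // through the result pointer in Ops[0].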
13031     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13032     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
13033     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13034     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13035     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13036     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
13037     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
13038     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13039   }
13040   case NEON::BI__builtin_neon_vld3_lane_v:
13041   case NEON::BI__builtin_neon_vld3q_lane_v: {
13042     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13043     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
13044     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13045     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13046     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13047     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
13048     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
13049     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
13050     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13051   }
13052   case NEON::BI__builtin_neon_vld4_lane_v:
13053   case NEON::BI__builtin_neon_vld4q_lane_v: {
13054     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13055     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
13056     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13057     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13058     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13059     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
13060     Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
13061     Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
13062     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
13063     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13064   }
13065   case NEON::BI__builtin_neon_vst2_v:
13066   case NEON::BI__builtin_neon_vst2q_v: {
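    // Rotate the destination pointer to the end so the operands match the
    // st2 intrinsic signature (v1, v2, ptr).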
13067     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13068     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
13069     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
13070                         Ops, "");
13071   }
13072   case NEON::BI__builtin_neon_vst2_lane_v:
13073   case NEON::BI__builtin_neon_vst2q_lane_v: {
13074     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13075     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
13076     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
13077     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
13078                         Ops, "");
13079   }
13080   case NEON::BI__builtin_neon_vst3_v:
13081   case NEON::BI__builtin_neon_vst3q_v: {
13082     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13083     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
13084     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
13085                         Ops, "");
13086   }
13087   case NEON::BI__builtin_neon_vst3_lane_v:
13088   case NEON::BI__builtin_neon_vst3q_lane_v: {
13089     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13090     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
13091     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
13092     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
13093                         Ops, "");
13094   }
13095   case NEON::BI__builtin_neon_vst4_v:
13096   case NEON::BI__builtin_neon_vst4q_v: {
13097     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13098     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
13099     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
13100                         Ops, "");
13101   }
13102   case NEON::BI__builtin_neon_vst4_lane_v:
13103   case NEON::BI__builtin_neon_vst4q_lane_v: {
13104     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13105     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
13106     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
13107     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
13108                         Ops, "");
13109   }
13110   case NEON::BI__builtin_neon_vtrn_v:
13111   case NEON::BI__builtin_neon_vtrnq_v: {
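    // vtrn produces two results: pass vi==0 interleaves the even lanes of the
    // two sources, vi==1 the odd lanes; each result is stored to consecutive
    // vectors at the result pointer.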
13112     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13113     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13114     Value *SV = nullptr;
13115 
13116     for (unsigned vi = 0; vi != 2; ++vi) {
13117       SmallVector<int, 16> Indices;
13118       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
13119         Indices.push_back(i+vi);
13120         Indices.push_back(i+e+vi);
13121       }
13122       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13123       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
13124       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13125     }
13126     return SV;
13127   }
13128   case NEON::BI__builtin_neon_vuzp_v:
13129   case NEON::BI__builtin_neon_vuzpq_v: {
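    // vuzp de-interleaves: pass vi==0 gathers the even-numbered lanes from
    // both sources, vi==1 the odd-numbered lanes.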
13130     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13131     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13132     Value *SV = nullptr;
13133 
13134     for (unsigned vi = 0; vi != 2; ++vi) {
13135       SmallVector<int, 16> Indices;
13136       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
13137         Indices.push_back(2*i+vi);
13138 
13139       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13140       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
13141       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13142     }
13143     return SV;
13144   }
13145   case NEON::BI__builtin_neon_vzip_v:
13146   case NEON::BI__builtin_neon_vzipq_v: {
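    // vzip interleaves: pass vi==0 zips the low halves of the two sources,
    // vi==1 the high halves.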
13147     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13148     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13149     Value *SV = nullptr;
13150 
13151     for (unsigned vi = 0; vi != 2; ++vi) {
13152       SmallVector<int, 16> Indices;
13153       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
13154         Indices.push_back((i + vi*e) >> 1);
13155         Indices.push_back(((i + vi*e) >> 1)+e);
13156       }
13157       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13158       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
13159       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13160     }
13161     return SV;
13162   }
13163   case NEON::BI__builtin_neon_vqtbl1q_v: {
13164     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
13165                         Ops, "vtbl1");
13166   }
13167   case NEON::BI__builtin_neon_vqtbl2q_v: {
13168     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
13169                         Ops, "vtbl2");
13170   }
13171   case NEON::BI__builtin_neon_vqtbl3q_v: {
13172     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
13173                         Ops, "vtbl3");
13174   }
13175   case NEON::BI__builtin_neon_vqtbl4q_v: {
13176     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
13177                         Ops, "vtbl4");
13178   }
13179   case NEON::BI__builtin_neon_vqtbx1q_v: {
13180     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
13181                         Ops, "vtbx1");
13182   }
13183   case NEON::BI__builtin_neon_vqtbx2q_v: {
13184     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
13185                         Ops, "vtbx2");
13186   }
13187   case NEON::BI__builtin_neon_vqtbx3q_v: {
13188     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
13189                         Ops, "vtbx3");
13190   }
13191   case NEON::BI__builtin_neon_vqtbx4q_v: {
13192     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
13193                         Ops, "vtbx4");
13194   }
13195   case NEON::BI__builtin_neon_vsqadd_v:
13196   case NEON::BI__builtin_neon_vsqaddq_v: {
13197     Int = Intrinsic::aarch64_neon_usqadd;
13198     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
13199   }
13200   case NEON::BI__builtin_neon_vuqadd_v:
13201   case NEON::BI__builtin_neon_vuqaddq_v: {
13202     Int = Intrinsic::aarch64_neon_suqadd;
13203     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
13204   }
13205   }
13206 }
13207 
13208 Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
13209                                            const CallExpr *E) {
13210   assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
13211           BuiltinID == BPF::BI__builtin_btf_type_id ||
13212           BuiltinID == BPF::BI__builtin_preserve_type_info ||
13213           BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
13214          "unexpected BPF builtin");
13215 
  // A sequence number, injected into IR builtin calls, to prevent CSE
  // when the only difference between otherwise identical calls may be
  // the debuginfo metadata.
13219   static uint32_t BuiltinSeqNum;
13220 
13221   switch (BuiltinID) {
13222   default:
13223     llvm_unreachable("Unexpected BPF builtin");
13224   case BPF::BI__builtin_preserve_field_info: {
13225     const Expr *Arg = E->getArg(0);
13226     bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
13227 
13228     if (!getDebugInfo()) {
13229       CGM.Error(E->getExprLoc(),
13230                 "using __builtin_preserve_field_info() without -g");
13231       return IsBitField ? EmitLValue(Arg).getBitFieldPointer()
13232                         : EmitLValue(Arg).getPointer(*this);
13233     }
13234 
13235     // Enable underlying preserve_*_access_index() generation.
13236     bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
13237     IsInPreservedAIRegion = true;
13238     Value *FieldAddr = IsBitField ? EmitLValue(Arg).getBitFieldPointer()
13239                                   : EmitLValue(Arg).getPointer(*this);
13240     IsInPreservedAIRegion = OldIsInPreservedAIRegion;
13241 
13242     ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13243     Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());
13244 
    // Build the IR for the preserve_field_info intrinsic.
13246     llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration(
13247         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info,
13248         {FieldAddr->getType()});
13249     return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
13250   }
13251   case BPF::BI__builtin_btf_type_id:
13252   case BPF::BI__builtin_preserve_type_info: {
13253     if (!getDebugInfo()) {
13254       CGM.Error(E->getExprLoc(), "using builtin function without -g");
13255       return nullptr;
13256     }
13257 
13258     const Expr *Arg0 = E->getArg(0);
13259     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
13260         Arg0->getType(), Arg0->getExprLoc());
13261 
13262     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13263     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
13264     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
13265 
13266     llvm::Function *FnDecl;
13267     if (BuiltinID == BPF::BI__builtin_btf_type_id)
13268       FnDecl = llvm::Intrinsic::getDeclaration(
13269           &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {});
13270     else
13271       FnDecl = llvm::Intrinsic::getDeclaration(
13272           &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {});
13273     CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
13274     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
13275     return Fn;
13276   }
13277   case BPF::BI__builtin_preserve_enum_value: {
13278     if (!getDebugInfo()) {
13279       CGM.Error(E->getExprLoc(), "using builtin function without -g");
13280       return nullptr;
13281     }
13282 
13283     const Expr *Arg0 = E->getArg(0);
13284     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
13285         Arg0->getType(), Arg0->getExprLoc());
13286 
13287     // Find enumerator
13288     const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
13289     const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
13290     const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
13291     const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());
13292 
13293     auto InitVal = Enumerator->getInitVal();
13294     std::string InitValStr;
13295     if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
13296       InitValStr = std::to_string(InitVal.getSExtValue());
13297     else
13298       InitValStr = std::to_string(InitVal.getZExtValue());
13299     std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
13300     Value *EnumStrVal = Builder.CreateGlobalStringPtr(EnumStr);
13301 
13302     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13303     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
13304     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
13305 
13306     llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration(
13307         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {});
13308     CallInst *Fn =
13309         Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
13310     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
13311     return Fn;
13312   }
13313   }
13314 }
13315 
13316 llvm::Value *CodeGenFunction::
13317 BuildVector(ArrayRef<llvm::Value*> Ops) {
13318   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
13319          "Not a power-of-two sized vector!");
13320   bool AllConstants = true;
13321   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
13322     AllConstants &= isa<Constant>(Ops[i]);
13323 
13324   // If this is a constant vector, create a ConstantVector.
13325   if (AllConstants) {
13326     SmallVector<llvm::Constant*, 16> CstOps;
13327     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
13328       CstOps.push_back(cast<Constant>(Ops[i]));
13329     return llvm::ConstantVector::get(CstOps);
13330   }
13331 
13332   // Otherwise, insertelement the values to build the vector.
13333   Value *Result = llvm::PoisonValue::get(
13334       llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
13335 
13336   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
13337     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));
13338 
13339   return Result;
13340 }
13341 
13342 // Convert the mask from an integer type to a vector of i1.
13343 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
13344                               unsigned NumElts) {
13345 
13346   auto *MaskTy = llvm::FixedVectorType::get(
13347       CGF.Builder.getInt1Ty(),
13348       cast<IntegerType>(Mask->getType())->getBitWidth());
13349   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
13350 
  // If we have fewer than 8 elements, then the starting mask was an i8 and
  // we need to extract down to the right number of elements.
13353   if (NumElts < 8) {
13354     int Indices[4];
13355     for (unsigned i = 0; i != NumElts; ++i)
13356       Indices[i] = i;
13357     MaskVec = CGF.Builder.CreateShuffleVector(
13358         MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract");
13359   }
13360   return MaskVec;
13361 }
13362 
13363 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13364                                  Align Alignment) {
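  // Ops = (pointer, data, mask): convert the integer mask to a vector of i1
  // and emit an llvm.masked.store of the data through the pointer.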
13365   Value *Ptr = Ops[0];
13366 
13367   Value *MaskVec = getMaskVecValue(
13368       CGF, Ops[2],
13369       cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
13370 
13371   return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
13372 }
13373 
13374 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13375                                 Align Alignment) {
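  // Ops = (pointer, passthru, mask): convert the integer mask to a vector of
  // i1 and emit an llvm.masked.load, with Ops[1] supplying the masked-off
  // lanes.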
13376   llvm::Type *Ty = Ops[1]->getType();
13377   Value *Ptr = Ops[0];
13378 
13379   Value *MaskVec = getMaskVecValue(
13380       CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
13381 
13382   return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
13383 }
13384 
13385 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
13386                                 ArrayRef<Value *> Ops) {
13387   auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
13388   Value *Ptr = Ops[0];
13389 
13390   Value *MaskVec = getMaskVecValue(
13391       CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
13392 
13393   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
13394                                            ResultTy);
13395   return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
13396 }
13397 
13398 static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
13399                                     ArrayRef<Value *> Ops,
13400                                     bool IsCompress) {
13401   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
13402 
13403   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
13404 
13405   Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
13406                                  : Intrinsic::x86_avx512_mask_expand;
13407   llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
13408   return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
13409 }
13410 
13411 static Value *EmitX86CompressStore(CodeGenFunction &CGF,
13412                                    ArrayRef<Value *> Ops) {
13413   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
13414   Value *Ptr = Ops[0];
13415 
13416   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
13417 
13418   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
13419                                            ResultTy);
13420   return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
13421 }
13422 
13423 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
13424                               ArrayRef<Value *> Ops,
13425                               bool InvertLHS = false) {
13426   unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
13427   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
13428   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
13429 
13430   if (InvertLHS)
13431     LHS = CGF.Builder.CreateNot(LHS);
13432 
13433   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
13434                                    Ops[0]->getType());
13435 }
13436 
13437 static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
13438                                  Value *Amt, bool IsRight) {
13439   llvm::Type *Ty = Op0->getType();
13440 
  // The amount may be a scalar immediate, in which case create a splat vector.
  // Funnel shift amounts are treated as modulo, and the types are all
  // power-of-2, so we only care about the lowest log2 bits anyway.
13444   if (Amt->getType() != Ty) {
13445     unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
13446     Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
13447     Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
13448   }
13449 
13450   unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
13451   Function *F = CGF.CGM.getIntrinsic(IID, Ty);
13452   return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
13453 }
13454 
13455 static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13456                            bool IsSigned) {
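  // The low three bits of the immediate select the comparison predicate:
  // 0=lt, 1=le, 2=gt, 3=ge, 4=eq, 5=ne, 6=false, 7=true. The i1 result is
  // sign-extended back to the source element type.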
13457   Value *Op0 = Ops[0];
13458   Value *Op1 = Ops[1];
13459   llvm::Type *Ty = Op0->getType();
13460   uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
13461 
13462   CmpInst::Predicate Pred;
13463   switch (Imm) {
13464   case 0x0:
13465     Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
13466     break;
13467   case 0x1:
13468     Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
13469     break;
13470   case 0x2:
13471     Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
13472     break;
13473   case 0x3:
13474     Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
13475     break;
13476   case 0x4:
13477     Pred = ICmpInst::ICMP_EQ;
13478     break;
13479   case 0x5:
13480     Pred = ICmpInst::ICMP_NE;
13481     break;
13482   case 0x6:
13483     return llvm::Constant::getNullValue(Ty); // FALSE
13484   case 0x7:
13485     return llvm::Constant::getAllOnesValue(Ty); // TRUE
13486   default:
13487     llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
13488   }
13489 
13490   Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
13491   Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
13492   return Res;
13493 }
13494 
13495 static Value *EmitX86Select(CodeGenFunction &CGF,
13496                             Value *Mask, Value *Op0, Value *Op1) {
13497 
  // If the mask is all ones, just return the first argument.
13499   if (const auto *C = dyn_cast<Constant>(Mask))
13500     if (C->isAllOnesValue())
13501       return Op0;
13502 
13503   Mask = getMaskVecValue(
13504       CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
13505 
13506   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
13507 }
13508 
13509 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
13510                                   Value *Mask, Value *Op0, Value *Op1) {
  // If the mask is all ones, just return the first argument.
13512   if (const auto *C = dyn_cast<Constant>(Mask))
13513     if (C->isAllOnesValue())
13514       return Op0;
13515 
13516   auto *MaskTy = llvm::FixedVectorType::get(
13517       CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
13518   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
13519   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
13520   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
13521 }
13522 
13523 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
13524                                          unsigned NumElts, Value *MaskIn) {
13525   if (MaskIn) {
13526     const auto *C = dyn_cast<Constant>(MaskIn);
13527     if (!C || !C->isAllOnesValue())
13528       Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
13529   }
13530 
13531   if (NumElts < 8) {
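    // The smallest mask register is 8 bits wide, so widen narrow results to
    // eight elements; the extra lanes are taken from the zero vector.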
13532     int Indices[8];
13533     for (unsigned i = 0; i != NumElts; ++i)
13534       Indices[i] = i;
13535     for (unsigned i = NumElts; i != 8; ++i)
13536       Indices[i] = i % NumElts + NumElts;
13537     Cmp = CGF.Builder.CreateShuffleVector(
13538         Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
13539   }
13540 
13541   return CGF.Builder.CreateBitCast(Cmp,
13542                                    IntegerType::get(CGF.getLLVMContext(),
13543                                                     std::max(NumElts, 8U)));
13544 }
13545 
13546 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
13547                                    bool Signed, ArrayRef<Value *> Ops) {
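  // CC selects the integer predicate: 0=eq, 1=lt, 2=le, 3=false, 4=ne, 5=ge,
  // 6=gt, 7=true. A fourth operand, if present, is a mask ANDed into the
  // comparison result.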
13548   assert((Ops.size() == 2 || Ops.size() == 4) &&
13549          "Unexpected number of arguments");
13550   unsigned NumElts =
13551       cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13552   Value *Cmp;
13553 
13554   if (CC == 3) {
13555     Cmp = Constant::getNullValue(
13556         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
13557   } else if (CC == 7) {
13558     Cmp = Constant::getAllOnesValue(
13559         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
13560   } else {
13561     ICmpInst::Predicate Pred;
13562     switch (CC) {
13563     default: llvm_unreachable("Unknown condition code");
13564     case 0: Pred = ICmpInst::ICMP_EQ;  break;
13565     case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
13566     case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
13567     case 4: Pred = ICmpInst::ICMP_NE;  break;
13568     case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
13569     case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
13570     }
13571     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
13572   }
13573 
13574   Value *MaskIn = nullptr;
13575   if (Ops.size() == 4)
13576     MaskIn = Ops[3];
13577 
13578   return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
13579 }
13580 
13581 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
13582   Value *Zero = Constant::getNullValue(In->getType());
13583   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
13584 }
13585 
13586 static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
13587                                     ArrayRef<Value *> Ops, bool IsSigned) {
13588   unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
13589   llvm::Type *Ty = Ops[1]->getType();
13590 
13591   Value *Res;
13592   if (Rnd != 4) {
13593     Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
13594                                  : Intrinsic::x86_avx512_uitofp_round;
13595     Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
13596     Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
13597   } else {
13598     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13599     Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
13600                    : CGF.Builder.CreateUIToFP(Ops[0], Ty);
13601   }
13602 
13603   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
13604 }
13605 
13606 // Lowers X86 FMA intrinsics to IR.
13607 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
13608                              ArrayRef<Value *> Ops, unsigned BuiltinID,
13609                              bool IsAddSub) {
13610 
13611   bool Subtract = false;
13612   Intrinsic::ID IID = Intrinsic::not_intrinsic;
13613   switch (BuiltinID) {
13614   default: break;
13615   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
13616     Subtract = true;
13617     [[fallthrough]];
13618   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
13619   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
13620   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
13621     IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
13622     break;
13623   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13624     Subtract = true;
13625     [[fallthrough]];
13626   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
13627   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13628   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13629     IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
13630     break;
13631   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
13632     Subtract = true;
13633     [[fallthrough]];
13634   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
13635   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
13636   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13637     IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
13638   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13639     Subtract = true;
13640     [[fallthrough]];
13641   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13642   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13643   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13644     IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
13645   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13646     Subtract = true;
13647     [[fallthrough]];
13648   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13649   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13650   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13651     IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
13652     break;
13653   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13654     Subtract = true;
13655     [[fallthrough]];
13656   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13657   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13658   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13659     IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
13660     break;
13661   }
13662 
13663   Value *A = Ops[0];
13664   Value *B = Ops[1];
13665   Value *C = Ops[2];
13666 
13667   if (Subtract)
13668     C = CGF.Builder.CreateFNeg(C);
13669 
13670   Value *Res;
13671 
  // Use the target-specific intrinsic when a rounding mode other than
  // _MM_FROUND_CUR_DIRECTION (4) is requested, or for the add/sub forms;
  // otherwise fall back to the generic fma intrinsic below.
13673   if (IID != Intrinsic::not_intrinsic &&
13674       (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
13675        IsAddSub)) {
13676     Function *Intr = CGF.CGM.getIntrinsic(IID);
13677     Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
13678   } else {
13679     llvm::Type *Ty = A->getType();
13680     Function *FMA;
13681     if (CGF.Builder.getIsFPConstrained()) {
13682       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13683       FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
13684       Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
13685     } else {
13686       FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
13687       Res = CGF.Builder.CreateCall(FMA, {A, B, C});
13688     }
13689   }
13690 
13691   // Handle any required masking.
13692   Value *MaskFalseVal = nullptr;
13693   switch (BuiltinID) {
13694   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
13695   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
13696   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13697   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
13698   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13699   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13700     MaskFalseVal = Ops[0];
13701     break;
13702   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
13703   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
13704   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13705   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13706   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13707   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13708     MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
13709     break;
13710   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
13711   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
13712   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
13713   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13714   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13715   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13716   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13717   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13718   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13719   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13720   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13721   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13722     MaskFalseVal = Ops[2];
13723     break;
13724   }
13725 
13726   if (MaskFalseVal)
13727     return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
13728 
13729   return Res;
13730 }
13731 
13732 static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
13733                                 MutableArrayRef<Value *> Ops, Value *Upper,
13734                                 bool ZeroMask = false, unsigned PTIdx = 0,
13735                                 bool NegAcc = false) {
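  // Scalar FMA: operate on element 0 of the vector operands, apply any
  // requested masking, and insert the result back into element 0 of Upper.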
13736   unsigned Rnd = 4;
13737   if (Ops.size() > 4)
13738     Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
13739 
13740   if (NegAcc)
13741     Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
13742 
13743   Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
13744   Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
13745   Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
13746   Value *Res;
13747   if (Rnd != 4) {
13748     Intrinsic::ID IID;
13749 
13750     switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
13751     case 16:
13752       IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
13753       break;
13754     case 32:
13755       IID = Intrinsic::x86_avx512_vfmadd_f32;
13756       break;
13757     case 64:
13758       IID = Intrinsic::x86_avx512_vfmadd_f64;
13759       break;
13760     default:
13761       llvm_unreachable("Unexpected size");
13762     }
13763     Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13764                                  {Ops[0], Ops[1], Ops[2], Ops[4]});
13765   } else if (CGF.Builder.getIsFPConstrained()) {
13766     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13767     Function *FMA = CGF.CGM.getIntrinsic(
13768         Intrinsic::experimental_constrained_fma, Ops[0]->getType());
13769     Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
13770   } else {
13771     Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
13772     Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
13773   }
13774   // If we have more than 3 arguments, we need to do masking.
13775   if (Ops.size() > 3) {
13776     Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
13777                                : Ops[PTIdx];
13778 
    // If we negated the accumulator and it is also the PassThru value, we
    // need to bypass the negate. Conveniently, Upper should be the same thing
    // in this case.
13782     if (NegAcc && PTIdx == 2)
13783       PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
13784 
13785     Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
13786   }
13787   return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
13788 }
13789 
13790 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
13791                            ArrayRef<Value *> Ops) {
13792   llvm::Type *Ty = Ops[0]->getType();
13793   // Arguments have a vXi32 type so cast to vXi64.
13794   Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
13795                                   Ty->getPrimitiveSizeInBits() / 64);
13796   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
13797   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
13798 
13799   if (IsSigned) {
13800     // Shift left then arithmetic shift right.
13801     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
13802     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
13803     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
13804     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
13805     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
13806   } else {
13807     // Clear the upper bits.
13808     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
13809     LHS = CGF.Builder.CreateAnd(LHS, Mask);
13810     RHS = CGF.Builder.CreateAnd(RHS, Mask);
13811   }
13812 
13813   return CGF.Builder.CreateMul(LHS, RHS);
13814 }
13815 
13816 // Emit a masked pternlog intrinsic. This only exists because the header has to
13817 // use a macro and we aren't able to pass the input argument to a pternlog
13818 // builtin and a select builtin without evaluating it twice.
13819 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
13820                              ArrayRef<Value *> Ops) {
13821   llvm::Type *Ty = Ops[0]->getType();
13822 
13823   unsigned VecWidth = Ty->getPrimitiveSizeInBits();
13824   unsigned EltWidth = Ty->getScalarSizeInBits();
13825   Intrinsic::ID IID;
13826   if (VecWidth == 128 && EltWidth == 32)
13827     IID = Intrinsic::x86_avx512_pternlog_d_128;
13828   else if (VecWidth == 256 && EltWidth == 32)
13829     IID = Intrinsic::x86_avx512_pternlog_d_256;
13830   else if (VecWidth == 512 && EltWidth == 32)
13831     IID = Intrinsic::x86_avx512_pternlog_d_512;
13832   else if (VecWidth == 128 && EltWidth == 64)
13833     IID = Intrinsic::x86_avx512_pternlog_q_128;
13834   else if (VecWidth == 256 && EltWidth == 64)
13835     IID = Intrinsic::x86_avx512_pternlog_q_256;
13836   else if (VecWidth == 512 && EltWidth == 64)
13837     IID = Intrinsic::x86_avx512_pternlog_q_512;
13838   else
13839     llvm_unreachable("Unexpected intrinsic");
13840 
13841   Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13842                                           Ops.drop_back());
13843   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
13844   return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
13845 }
13846 
13847 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
13848                               llvm::Type *DstTy) {
13849   unsigned NumberOfElements =
13850       cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13851   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
13852   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
13853 }
13854 
13855 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
13856   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
13857   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
13858   return EmitX86CpuIs(CPUStr);
13859 }
13860 
// Convert F16 halves to floats.
13862 static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
13863                                        ArrayRef<Value *> Ops,
13864                                        llvm::Type *DstTy) {
13865   assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
13866          "Unknown cvtph2ps intrinsic");
13867 
13868   // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
13869   if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
13870     Function *F =
13871         CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
13872     return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
13873   }
13874 
13875   unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13876   Value *Src = Ops[0];
13877 
13878   // Extract the subvector.
13879   if (NumDstElts !=
13880       cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
13881     assert(NumDstElts == 4 && "Unexpected vector size");
13882     Src = CGF.Builder.CreateShuffleVector(Src, ArrayRef<int>{0, 1, 2, 3});
13883   }
13884 
13885   // Bitcast from vXi16 to vXf16.
13886   auto *HalfTy = llvm::FixedVectorType::get(
13887       llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
13888   Src = CGF.Builder.CreateBitCast(Src, HalfTy);
13889 
13890   // Perform the fp-extension.
13891   Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
13892 
13893   if (Ops.size() >= 3)
13894     Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
13895   return Res;
13896 }
13897 
13898 Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
13899 
13900   llvm::Type *Int32Ty = Builder.getInt32Ty();
13901 
13902   // Matching the struct layout from the compiler-rt/libgcc structure that is
13903   // filled in:
13904   // unsigned int __cpu_vendor;
13905   // unsigned int __cpu_type;
13906   // unsigned int __cpu_subtype;
13907   // unsigned int __cpu_features[1];
13908   llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13909                                           llvm::ArrayType::get(Int32Ty, 1));
13910 
13911   // Grab the global __cpu_model.
13912   llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13913   cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13914 
13915   // Calculate the index needed to access the correct field based on the
13916   // range. Also adjust the expected value.
13917   unsigned Index;
13918   unsigned Value;
13919   std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
13920 #define X86_VENDOR(ENUM, STRING)                                               \
13921   .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
13922 #define X86_CPU_TYPE_ALIAS(ENUM, ALIAS)                                        \
13923   .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13924 #define X86_CPU_TYPE(ENUM, STR)                                                \
13925   .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13926 #define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS)                                     \
13927   .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13928 #define X86_CPU_SUBTYPE(ENUM, STR)                                             \
13929   .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13930 #include "llvm/TargetParser/X86TargetParser.def"
13931                                .Default({0, 0});
13932   assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
13933 
13934   // Grab the appropriate field from __cpu_model.
13935   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
13936                          ConstantInt::get(Int32Ty, Index)};
13937   llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
13938   CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
13939                                        CharUnits::fromQuantity(4));
13940 
13941   // Check the value of the field against the requested value.
  return Builder.CreateICmpEQ(CpuValue,
                              llvm::ConstantInt::get(Int32Ty, Value));
13944 }
13945 
13946 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
13947   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
13948   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
13949   return EmitX86CpuSupports(FeatureStr);
13950 }
13951 
13952 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
13953   return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
13954 }
13955 
13956 llvm::Value *
13957 CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
13958   Value *Result = Builder.getTrue();
13959   if (FeatureMask[0] != 0) {
13960     // Matching the struct layout from the compiler-rt/libgcc structure that is
13961     // filled in:
13962     // unsigned int __cpu_vendor;
13963     // unsigned int __cpu_type;
13964     // unsigned int __cpu_subtype;
13965     // unsigned int __cpu_features[1];
13966     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13967                                             llvm::ArrayType::get(Int32Ty, 1));
13968 
13969     // Grab the global __cpu_model.
13970     llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13971     cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13972 
13973     // Grab the first (0th) element from the field __cpu_features off of the
13974     // global in the struct STy.
13975     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
13976                      Builder.getInt32(0)};
13977     Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
13978     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
13979                                                 CharUnits::fromQuantity(4));
13980 
13981     // Check the value of the bit corresponding to the feature requested.
13982     Value *Mask = Builder.getInt32(FeatureMask[0]);
13983     Value *Bitset = Builder.CreateAnd(Features, Mask);
13984     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
13985     Result = Builder.CreateAnd(Result, Cmp);
13986   }
13987 
13988   llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
13989   llvm::Constant *CpuFeatures2 =
13990       CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
13991   cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
13992   for (int i = 1; i != 4; ++i) {
13993     const uint32_t M = FeatureMask[i];
13994     if (!M)
13995       continue;
13996     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
13997     Value *Features = Builder.CreateAlignedLoad(
13998         Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
13999         CharUnits::fromQuantity(4));
14000     // Check the value of the bit corresponding to the feature requested.
14001     Value *Mask = Builder.getInt32(M);
14002     Value *Bitset = Builder.CreateAnd(Features, Mask);
14003     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
14004     Result = Builder.CreateAnd(Result, Cmp);
14005   }
14006 
14007   return Result;
14008 }
14009 
14010 Value *CodeGenFunction::EmitAArch64CpuInit() {
14011   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
14012   llvm::FunctionCallee Func =
14013       CGM.CreateRuntimeFunction(FTy, "__init_cpu_features_resolver");
14014   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
14015   cast<llvm::GlobalValue>(Func.getCallee())
14016       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
14017   return Builder.CreateCall(Func);
14018 }
14019 
14020 Value *CodeGenFunction::EmitX86CpuInit() {
14021   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
14022                                                     /*Variadic*/ false);
14023   llvm::FunctionCallee Func =
14024       CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
14025   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
14026   cast<llvm::GlobalValue>(Func.getCallee())
14027       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
14028   return Builder.CreateCall(Func);
14029 }
14030 
14031 llvm::Value *
14032 CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
14033   uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
14034   Value *Result = Builder.getTrue();
14035   if (FeaturesMask != 0) {
    // Get the features from the structure provided by the runtime library:
14037     // struct {
14038     //   unsigned long long features;
14039     // } __aarch64_cpu_features;
14040     llvm::Type *STy = llvm::StructType::get(Int64Ty);
14041     llvm::Constant *AArch64CPUFeatures =
14042         CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
14043     cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
14044     llvm::Value *CpuFeatures = Builder.CreateGEP(
14045         STy, AArch64CPUFeatures,
14046         {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
14047     Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
14048                                                 CharUnits::fromQuantity(8));
14049     Value *Mask = Builder.getInt64(FeaturesMask);
14050     Value *Bitset = Builder.CreateAnd(Features, Mask);
14051     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
14052     Result = Builder.CreateAnd(Result, Cmp);
14053   }
14054   return Result;
14055 }
14056 
14057 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
14058                                            const CallExpr *E) {
14059   if (BuiltinID == X86::BI__builtin_cpu_is)
14060     return EmitX86CpuIs(E);
14061   if (BuiltinID == X86::BI__builtin_cpu_supports)
14062     return EmitX86CpuSupports(E);
14063   if (BuiltinID == X86::BI__builtin_cpu_init)
14064     return EmitX86CpuInit();
14065 
14066   // Handle MSVC intrinsics before argument evaluation to prevent double
14067   // evaluation.
14068   if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
14069     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
14070 
14071   SmallVector<Value*, 4> Ops;
14072   bool IsMaskFCmp = false;
14073   bool IsConjFMA = false;
14074 
14075   // Find out if any arguments are required to be integer constant expressions.
14076   unsigned ICEArguments = 0;
14077   ASTContext::GetBuiltinTypeError Error;
14078   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
14079   assert(Error == ASTContext::GE_None && "Should not codegen an error");
14080 
14081   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
14082     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
14083   }
14084 
  // These exist so that the builtin that takes an immediate can be bounds
  // checked by clang to avoid passing bad immediates to the backend. Since
  // AVX has a larger immediate than SSE we would need separate builtins to
  // do the different bounds checking. Rather than create a clang-specific
  // SSE-only builtin, this implements eight separate builtins to match the
  // gcc implementation.
14091   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
14092     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
14093     llvm::Function *F = CGM.getIntrinsic(ID);
14094     return Builder.CreateCall(F, Ops);
14095   };
14096 
14097   // For the vector forms of FP comparisons, translate the builtins directly to
14098   // IR.
14099   // TODO: The builtins could be removed if the SSE header files used vector
14100   // extension comparisons directly (vector ordered/unordered may need
14101   // additional support via __builtin_isnan()).
14102   auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
14103                                          bool IsSignaling) {
14104     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14105     Value *Cmp;
14106     if (IsSignaling)
14107       Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
14108     else
14109       Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
14110     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
14111     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
14112     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
14113     return Builder.CreateBitCast(Sext, FPVecTy);
14114   };
14115 
14116   switch (BuiltinID) {
14117   default: return nullptr;
14118   case X86::BI_mm_prefetch: {
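    // The hint immediate encodes the prefetch kind: bit 2 selects read (0) or
    // write (1) and bits 0-1 give the temporal locality; the trailing 1 marks
    // this as a data (not instruction) prefetch.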
14119     Value *Address = Ops[0];
14120     ConstantInt *C = cast<ConstantInt>(Ops[1]);
14121     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
14122     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
14123     Value *Data = ConstantInt::get(Int32Ty, 1);
14124     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
14125     return Builder.CreateCall(F, {Address, RW, Locality, Data});
14126   }
14127   case X86::BI_mm_clflush: {
14128     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
14129                               Ops[0]);
14130   }
14131   case X86::BI_mm_lfence: {
14132     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
14133   }
14134   case X86::BI_mm_mfence: {
14135     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
14136   }
14137   case X86::BI_mm_sfence: {
14138     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
14139   }
14140   case X86::BI_mm_pause: {
14141     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
14142   }
14143   case X86::BI__rdtsc: {
14144     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
14145   }
14146   case X86::BI__builtin_ia32_rdtscp: {
14147     Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
14148     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
14149                                       Ops[0]);
14150     return Builder.CreateExtractValue(Call, 0);
14151   }
14152   case X86::BI__builtin_ia32_lzcnt_u16:
14153   case X86::BI__builtin_ia32_lzcnt_u32:
14154   case X86::BI__builtin_ia32_lzcnt_u64: {
14155     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
14156     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
14157   }
14158   case X86::BI__builtin_ia32_tzcnt_u16:
14159   case X86::BI__builtin_ia32_tzcnt_u32:
14160   case X86::BI__builtin_ia32_tzcnt_u64: {
14161     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
14162     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
14163   }
14164   case X86::BI__builtin_ia32_undef128:
14165   case X86::BI__builtin_ia32_undef256:
14166   case X86::BI__builtin_ia32_undef512:
14167     // The x86 definition of "undef" is not the same as the LLVM definition
14168     // (PR32176). We leave optimizing away an unnecessary zero constant to the
14169     // IR optimizer and backend.
14170     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
14171     // value, we should use that here instead of a zero.
14172     return llvm::Constant::getNullValue(ConvertType(E->getType()));
14173   case X86::BI__builtin_ia32_vec_init_v8qi:
14174   case X86::BI__builtin_ia32_vec_init_v4hi:
14175   case X86::BI__builtin_ia32_vec_init_v2si:
14176     return Builder.CreateBitCast(BuildVector(Ops),
14177                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
14178   case X86::BI__builtin_ia32_vec_ext_v2si:
14179   case X86::BI__builtin_ia32_vec_ext_v16qi:
14180   case X86::BI__builtin_ia32_vec_ext_v8hi:
14181   case X86::BI__builtin_ia32_vec_ext_v4si:
14182   case X86::BI__builtin_ia32_vec_ext_v4sf:
14183   case X86::BI__builtin_ia32_vec_ext_v2di:
14184   case X86::BI__builtin_ia32_vec_ext_v32qi:
14185   case X86::BI__builtin_ia32_vec_ext_v16hi:
14186   case X86::BI__builtin_ia32_vec_ext_v8si:
14187   case X86::BI__builtin_ia32_vec_ext_v4di: {
14188     unsigned NumElts =
14189         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14190     uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
14191     Index &= NumElts - 1;
14192     // These builtins exist so we can ensure the index is an ICE and in range.
14193     // Otherwise we could just do this in the header file.
14194     return Builder.CreateExtractElement(Ops[0], Index);
14195   }
14196   case X86::BI__builtin_ia32_vec_set_v16qi:
14197   case X86::BI__builtin_ia32_vec_set_v8hi:
14198   case X86::BI__builtin_ia32_vec_set_v4si:
14199   case X86::BI__builtin_ia32_vec_set_v2di:
14200   case X86::BI__builtin_ia32_vec_set_v32qi:
14201   case X86::BI__builtin_ia32_vec_set_v16hi:
14202   case X86::BI__builtin_ia32_vec_set_v8si:
14203   case X86::BI__builtin_ia32_vec_set_v4di: {
14204     unsigned NumElts =
14205         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14206     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
14207     Index &= NumElts - 1;
14208     // These builtins exist so we can ensure the index is an ICE and in range.
14209     // Otherwise we could just do this in the header file.
14210     return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
14211   }
14212   case X86::BI_mm_setcsr:
14213   case X86::BI__builtin_ia32_ldmxcsr: {
14214     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
14215     Builder.CreateStore(Ops[0], Tmp);
14216     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
14217                               Tmp.getPointer());
14218   }
14219   case X86::BI_mm_getcsr:
14220   case X86::BI__builtin_ia32_stmxcsr: {
14221     Address Tmp = CreateMemTemp(E->getType());
14222     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
14223                        Tmp.getPointer());
14224     return Builder.CreateLoad(Tmp, "stmxcsr");
14225   }
14226   case X86::BI__builtin_ia32_xsave:
14227   case X86::BI__builtin_ia32_xsave64:
14228   case X86::BI__builtin_ia32_xrstor:
14229   case X86::BI__builtin_ia32_xrstor64:
14230   case X86::BI__builtin_ia32_xsaveopt:
14231   case X86::BI__builtin_ia32_xsaveopt64:
14232   case X86::BI__builtin_ia32_xrstors:
14233   case X86::BI__builtin_ia32_xrstors64:
14234   case X86::BI__builtin_ia32_xsavec:
14235   case X86::BI__builtin_ia32_xsavec64:
14236   case X86::BI__builtin_ia32_xsaves:
14237   case X86::BI__builtin_ia32_xsaves64:
14238   case X86::BI__builtin_ia32_xsetbv:
14239   case X86::BI_xsetbv: {
14240     Intrinsic::ID ID;
14241 #define INTRINSIC_X86_XSAVE_ID(NAME) \
14242     case X86::BI__builtin_ia32_##NAME: \
14243       ID = Intrinsic::x86_##NAME; \
14244       break
14245     switch (BuiltinID) {
14246     default: llvm_unreachable("Unsupported intrinsic!");
14247     INTRINSIC_X86_XSAVE_ID(xsave);
14248     INTRINSIC_X86_XSAVE_ID(xsave64);
14249     INTRINSIC_X86_XSAVE_ID(xrstor);
14250     INTRINSIC_X86_XSAVE_ID(xrstor64);
14251     INTRINSIC_X86_XSAVE_ID(xsaveopt);
14252     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
14253     INTRINSIC_X86_XSAVE_ID(xrstors);
14254     INTRINSIC_X86_XSAVE_ID(xrstors64);
14255     INTRINSIC_X86_XSAVE_ID(xsavec);
14256     INTRINSIC_X86_XSAVE_ID(xsavec64);
14257     INTRINSIC_X86_XSAVE_ID(xsaves);
14258     INTRINSIC_X86_XSAVE_ID(xsaves64);
14259     INTRINSIC_X86_XSAVE_ID(xsetbv);
14260     case X86::BI_xsetbv:
14261       ID = Intrinsic::x86_xsetbv;
14262       break;
14263     }
14264 #undef INTRINSIC_X86_XSAVE_ID
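    // The xsave family and xsetbv take the 64-bit feature mask as two i32
    // halves (high half in EDX, low half in EAX), so split Ops[1] here.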
14265     Value *Mhi = Builder.CreateTrunc(
14266       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
14267     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
14268     Ops[1] = Mhi;
14269     Ops.push_back(Mlo);
14270     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
14271   }
14272   case X86::BI__builtin_ia32_xgetbv:
14273   case X86::BI_xgetbv:
14274     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
14275   case X86::BI__builtin_ia32_storedqudi128_mask:
14276   case X86::BI__builtin_ia32_storedqusi128_mask:
14277   case X86::BI__builtin_ia32_storedquhi128_mask:
14278   case X86::BI__builtin_ia32_storedquqi128_mask:
14279   case X86::BI__builtin_ia32_storeupd128_mask:
14280   case X86::BI__builtin_ia32_storeups128_mask:
14281   case X86::BI__builtin_ia32_storedqudi256_mask:
14282   case X86::BI__builtin_ia32_storedqusi256_mask:
14283   case X86::BI__builtin_ia32_storedquhi256_mask:
14284   case X86::BI__builtin_ia32_storedquqi256_mask:
14285   case X86::BI__builtin_ia32_storeupd256_mask:
14286   case X86::BI__builtin_ia32_storeups256_mask:
14287   case X86::BI__builtin_ia32_storedqudi512_mask:
14288   case X86::BI__builtin_ia32_storedqusi512_mask:
14289   case X86::BI__builtin_ia32_storedquhi512_mask:
14290   case X86::BI__builtin_ia32_storedquqi512_mask:
14291   case X86::BI__builtin_ia32_storeupd512_mask:
14292   case X86::BI__builtin_ia32_storeups512_mask:
14293     return EmitX86MaskedStore(*this, Ops, Align(1));
14294 
14295   case X86::BI__builtin_ia32_storesh128_mask:
14296   case X86::BI__builtin_ia32_storess128_mask:
14297   case X86::BI__builtin_ia32_storesd128_mask:
14298     return EmitX86MaskedStore(*this, Ops, Align(1));
14299 
14300   case X86::BI__builtin_ia32_vpopcntb_128:
14301   case X86::BI__builtin_ia32_vpopcntd_128:
14302   case X86::BI__builtin_ia32_vpopcntq_128:
14303   case X86::BI__builtin_ia32_vpopcntw_128:
14304   case X86::BI__builtin_ia32_vpopcntb_256:
14305   case X86::BI__builtin_ia32_vpopcntd_256:
14306   case X86::BI__builtin_ia32_vpopcntq_256:
14307   case X86::BI__builtin_ia32_vpopcntw_256:
14308   case X86::BI__builtin_ia32_vpopcntb_512:
14309   case X86::BI__builtin_ia32_vpopcntd_512:
14310   case X86::BI__builtin_ia32_vpopcntq_512:
14311   case X86::BI__builtin_ia32_vpopcntw_512: {
14312     llvm::Type *ResultType = ConvertType(E->getType());
14313     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
14314     return Builder.CreateCall(F, Ops);
14315   }
14316   case X86::BI__builtin_ia32_cvtmask2b128:
14317   case X86::BI__builtin_ia32_cvtmask2b256:
14318   case X86::BI__builtin_ia32_cvtmask2b512:
14319   case X86::BI__builtin_ia32_cvtmask2w128:
14320   case X86::BI__builtin_ia32_cvtmask2w256:
14321   case X86::BI__builtin_ia32_cvtmask2w512:
14322   case X86::BI__builtin_ia32_cvtmask2d128:
14323   case X86::BI__builtin_ia32_cvtmask2d256:
14324   case X86::BI__builtin_ia32_cvtmask2d512:
14325   case X86::BI__builtin_ia32_cvtmask2q128:
14326   case X86::BI__builtin_ia32_cvtmask2q256:
14327   case X86::BI__builtin_ia32_cvtmask2q512:
14328     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
14329 
14330   case X86::BI__builtin_ia32_cvtb2mask128:
14331   case X86::BI__builtin_ia32_cvtb2mask256:
14332   case X86::BI__builtin_ia32_cvtb2mask512:
14333   case X86::BI__builtin_ia32_cvtw2mask128:
14334   case X86::BI__builtin_ia32_cvtw2mask256:
14335   case X86::BI__builtin_ia32_cvtw2mask512:
14336   case X86::BI__builtin_ia32_cvtd2mask128:
14337   case X86::BI__builtin_ia32_cvtd2mask256:
14338   case X86::BI__builtin_ia32_cvtd2mask512:
14339   case X86::BI__builtin_ia32_cvtq2mask128:
14340   case X86::BI__builtin_ia32_cvtq2mask256:
14341   case X86::BI__builtin_ia32_cvtq2mask512:
14342     return EmitX86ConvertToMask(*this, Ops[0]);
14343 
14344   case X86::BI__builtin_ia32_cvtdq2ps512_mask:
14345   case X86::BI__builtin_ia32_cvtqq2ps512_mask:
14346   case X86::BI__builtin_ia32_cvtqq2pd512_mask:
14347   case X86::BI__builtin_ia32_vcvtw2ph512_mask:
14348   case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
14349   case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
14350     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
14351   case X86::BI__builtin_ia32_cvtudq2ps512_mask:
14352   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
14353   case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
14354   case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
14355   case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
14356   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
14357     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
14358 
14359   case X86::BI__builtin_ia32_vfmaddss3:
14360   case X86::BI__builtin_ia32_vfmaddsd3:
14361   case X86::BI__builtin_ia32_vfmaddsh3_mask:
14362   case X86::BI__builtin_ia32_vfmaddss3_mask:
14363   case X86::BI__builtin_ia32_vfmaddsd3_mask:
14364     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
14365   case X86::BI__builtin_ia32_vfmaddss:
14366   case X86::BI__builtin_ia32_vfmaddsd:
14367     return EmitScalarFMAExpr(*this, E, Ops,
14368                              Constant::getNullValue(Ops[0]->getType()));
14369   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
14370   case X86::BI__builtin_ia32_vfmaddss3_maskz:
14371   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
14372     return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
14373   case X86::BI__builtin_ia32_vfmaddsh3_mask3:
14374   case X86::BI__builtin_ia32_vfmaddss3_mask3:
14375   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
14376     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
14377   case X86::BI__builtin_ia32_vfmsubsh3_mask3:
14378   case X86::BI__builtin_ia32_vfmsubss3_mask3:
14379   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
14380     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
14381                              /*NegAcc*/ true);
14382   case X86::BI__builtin_ia32_vfmaddph:
14383   case X86::BI__builtin_ia32_vfmaddps:
14384   case X86::BI__builtin_ia32_vfmaddpd:
14385   case X86::BI__builtin_ia32_vfmaddph256:
14386   case X86::BI__builtin_ia32_vfmaddps256:
14387   case X86::BI__builtin_ia32_vfmaddpd256:
14388   case X86::BI__builtin_ia32_vfmaddph512_mask:
14389   case X86::BI__builtin_ia32_vfmaddph512_maskz:
14390   case X86::BI__builtin_ia32_vfmaddph512_mask3:
14391   case X86::BI__builtin_ia32_vfmaddps512_mask:
14392   case X86::BI__builtin_ia32_vfmaddps512_maskz:
14393   case X86::BI__builtin_ia32_vfmaddps512_mask3:
14394   case X86::BI__builtin_ia32_vfmsubps512_mask3:
14395   case X86::BI__builtin_ia32_vfmaddpd512_mask:
14396   case X86::BI__builtin_ia32_vfmaddpd512_maskz:
14397   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
14398   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
14399   case X86::BI__builtin_ia32_vfmsubph512_mask3:
14400     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
14401   case X86::BI__builtin_ia32_vfmaddsubph512_mask:
14402   case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
14403   case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
14404   case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
14405   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
14406   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
14407   case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
14408   case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
14409   case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
14410   case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
14411   case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
14412   case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
14413     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
14414 
14415   case X86::BI__builtin_ia32_movdqa32store128_mask:
14416   case X86::BI__builtin_ia32_movdqa64store128_mask:
14417   case X86::BI__builtin_ia32_storeaps128_mask:
14418   case X86::BI__builtin_ia32_storeapd128_mask:
14419   case X86::BI__builtin_ia32_movdqa32store256_mask:
14420   case X86::BI__builtin_ia32_movdqa64store256_mask:
14421   case X86::BI__builtin_ia32_storeaps256_mask:
14422   case X86::BI__builtin_ia32_storeapd256_mask:
14423   case X86::BI__builtin_ia32_movdqa32store512_mask:
14424   case X86::BI__builtin_ia32_movdqa64store512_mask:
14425   case X86::BI__builtin_ia32_storeaps512_mask:
14426   case X86::BI__builtin_ia32_storeapd512_mask:
14427     return EmitX86MaskedStore(
14428         *this, Ops,
14429         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
14430 
14431   case X86::BI__builtin_ia32_loadups128_mask:
14432   case X86::BI__builtin_ia32_loadups256_mask:
14433   case X86::BI__builtin_ia32_loadups512_mask:
14434   case X86::BI__builtin_ia32_loadupd128_mask:
14435   case X86::BI__builtin_ia32_loadupd256_mask:
14436   case X86::BI__builtin_ia32_loadupd512_mask:
14437   case X86::BI__builtin_ia32_loaddquqi128_mask:
14438   case X86::BI__builtin_ia32_loaddquqi256_mask:
14439   case X86::BI__builtin_ia32_loaddquqi512_mask:
14440   case X86::BI__builtin_ia32_loaddquhi128_mask:
14441   case X86::BI__builtin_ia32_loaddquhi256_mask:
14442   case X86::BI__builtin_ia32_loaddquhi512_mask:
14443   case X86::BI__builtin_ia32_loaddqusi128_mask:
14444   case X86::BI__builtin_ia32_loaddqusi256_mask:
14445   case X86::BI__builtin_ia32_loaddqusi512_mask:
14446   case X86::BI__builtin_ia32_loaddqudi128_mask:
14447   case X86::BI__builtin_ia32_loaddqudi256_mask:
14448   case X86::BI__builtin_ia32_loaddqudi512_mask:
14449     return EmitX86MaskedLoad(*this, Ops, Align(1));
14450 
14451   case X86::BI__builtin_ia32_loadsh128_mask:
14452   case X86::BI__builtin_ia32_loadss128_mask:
14453   case X86::BI__builtin_ia32_loadsd128_mask:
14454     return EmitX86MaskedLoad(*this, Ops, Align(1));
14455 
14456   case X86::BI__builtin_ia32_loadaps128_mask:
14457   case X86::BI__builtin_ia32_loadaps256_mask:
14458   case X86::BI__builtin_ia32_loadaps512_mask:
14459   case X86::BI__builtin_ia32_loadapd128_mask:
14460   case X86::BI__builtin_ia32_loadapd256_mask:
14461   case X86::BI__builtin_ia32_loadapd512_mask:
14462   case X86::BI__builtin_ia32_movdqa32load128_mask:
14463   case X86::BI__builtin_ia32_movdqa32load256_mask:
14464   case X86::BI__builtin_ia32_movdqa32load512_mask:
14465   case X86::BI__builtin_ia32_movdqa64load128_mask:
14466   case X86::BI__builtin_ia32_movdqa64load256_mask:
14467   case X86::BI__builtin_ia32_movdqa64load512_mask:
14468     return EmitX86MaskedLoad(
14469         *this, Ops,
14470         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
14471 
14472   case X86::BI__builtin_ia32_expandloaddf128_mask:
14473   case X86::BI__builtin_ia32_expandloaddf256_mask:
14474   case X86::BI__builtin_ia32_expandloaddf512_mask:
14475   case X86::BI__builtin_ia32_expandloadsf128_mask:
14476   case X86::BI__builtin_ia32_expandloadsf256_mask:
14477   case X86::BI__builtin_ia32_expandloadsf512_mask:
14478   case X86::BI__builtin_ia32_expandloaddi128_mask:
14479   case X86::BI__builtin_ia32_expandloaddi256_mask:
14480   case X86::BI__builtin_ia32_expandloaddi512_mask:
14481   case X86::BI__builtin_ia32_expandloadsi128_mask:
14482   case X86::BI__builtin_ia32_expandloadsi256_mask:
14483   case X86::BI__builtin_ia32_expandloadsi512_mask:
14484   case X86::BI__builtin_ia32_expandloadhi128_mask:
14485   case X86::BI__builtin_ia32_expandloadhi256_mask:
14486   case X86::BI__builtin_ia32_expandloadhi512_mask:
14487   case X86::BI__builtin_ia32_expandloadqi128_mask:
14488   case X86::BI__builtin_ia32_expandloadqi256_mask:
14489   case X86::BI__builtin_ia32_expandloadqi512_mask:
14490     return EmitX86ExpandLoad(*this, Ops);
14491 
14492   case X86::BI__builtin_ia32_compressstoredf128_mask:
14493   case X86::BI__builtin_ia32_compressstoredf256_mask:
14494   case X86::BI__builtin_ia32_compressstoredf512_mask:
14495   case X86::BI__builtin_ia32_compressstoresf128_mask:
14496   case X86::BI__builtin_ia32_compressstoresf256_mask:
14497   case X86::BI__builtin_ia32_compressstoresf512_mask:
14498   case X86::BI__builtin_ia32_compressstoredi128_mask:
14499   case X86::BI__builtin_ia32_compressstoredi256_mask:
14500   case X86::BI__builtin_ia32_compressstoredi512_mask:
14501   case X86::BI__builtin_ia32_compressstoresi128_mask:
14502   case X86::BI__builtin_ia32_compressstoresi256_mask:
14503   case X86::BI__builtin_ia32_compressstoresi512_mask:
14504   case X86::BI__builtin_ia32_compressstorehi128_mask:
14505   case X86::BI__builtin_ia32_compressstorehi256_mask:
14506   case X86::BI__builtin_ia32_compressstorehi512_mask:
14507   case X86::BI__builtin_ia32_compressstoreqi128_mask:
14508   case X86::BI__builtin_ia32_compressstoreqi256_mask:
14509   case X86::BI__builtin_ia32_compressstoreqi512_mask:
14510     return EmitX86CompressStore(*this, Ops);
14511 
14512   case X86::BI__builtin_ia32_expanddf128_mask:
14513   case X86::BI__builtin_ia32_expanddf256_mask:
14514   case X86::BI__builtin_ia32_expanddf512_mask:
14515   case X86::BI__builtin_ia32_expandsf128_mask:
14516   case X86::BI__builtin_ia32_expandsf256_mask:
14517   case X86::BI__builtin_ia32_expandsf512_mask:
14518   case X86::BI__builtin_ia32_expanddi128_mask:
14519   case X86::BI__builtin_ia32_expanddi256_mask:
14520   case X86::BI__builtin_ia32_expanddi512_mask:
14521   case X86::BI__builtin_ia32_expandsi128_mask:
14522   case X86::BI__builtin_ia32_expandsi256_mask:
14523   case X86::BI__builtin_ia32_expandsi512_mask:
14524   case X86::BI__builtin_ia32_expandhi128_mask:
14525   case X86::BI__builtin_ia32_expandhi256_mask:
14526   case X86::BI__builtin_ia32_expandhi512_mask:
14527   case X86::BI__builtin_ia32_expandqi128_mask:
14528   case X86::BI__builtin_ia32_expandqi256_mask:
14529   case X86::BI__builtin_ia32_expandqi512_mask:
14530     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
14531 
14532   case X86::BI__builtin_ia32_compressdf128_mask:
14533   case X86::BI__builtin_ia32_compressdf256_mask:
14534   case X86::BI__builtin_ia32_compressdf512_mask:
14535   case X86::BI__builtin_ia32_compresssf128_mask:
14536   case X86::BI__builtin_ia32_compresssf256_mask:
14537   case X86::BI__builtin_ia32_compresssf512_mask:
14538   case X86::BI__builtin_ia32_compressdi128_mask:
14539   case X86::BI__builtin_ia32_compressdi256_mask:
14540   case X86::BI__builtin_ia32_compressdi512_mask:
14541   case X86::BI__builtin_ia32_compresssi128_mask:
14542   case X86::BI__builtin_ia32_compresssi256_mask:
14543   case X86::BI__builtin_ia32_compresssi512_mask:
14544   case X86::BI__builtin_ia32_compresshi128_mask:
14545   case X86::BI__builtin_ia32_compresshi256_mask:
14546   case X86::BI__builtin_ia32_compresshi512_mask:
14547   case X86::BI__builtin_ia32_compressqi128_mask:
14548   case X86::BI__builtin_ia32_compressqi256_mask:
14549   case X86::BI__builtin_ia32_compressqi512_mask:
14550     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
14551 
14552   case X86::BI__builtin_ia32_gather3div2df:
14553   case X86::BI__builtin_ia32_gather3div2di:
14554   case X86::BI__builtin_ia32_gather3div4df:
14555   case X86::BI__builtin_ia32_gather3div4di:
14556   case X86::BI__builtin_ia32_gather3div4sf:
14557   case X86::BI__builtin_ia32_gather3div4si:
14558   case X86::BI__builtin_ia32_gather3div8sf:
14559   case X86::BI__builtin_ia32_gather3div8si:
14560   case X86::BI__builtin_ia32_gather3siv2df:
14561   case X86::BI__builtin_ia32_gather3siv2di:
14562   case X86::BI__builtin_ia32_gather3siv4df:
14563   case X86::BI__builtin_ia32_gather3siv4di:
14564   case X86::BI__builtin_ia32_gather3siv4sf:
14565   case X86::BI__builtin_ia32_gather3siv4si:
14566   case X86::BI__builtin_ia32_gather3siv8sf:
14567   case X86::BI__builtin_ia32_gather3siv8si:
14568   case X86::BI__builtin_ia32_gathersiv8df:
14569   case X86::BI__builtin_ia32_gathersiv16sf:
14570   case X86::BI__builtin_ia32_gatherdiv8df:
14571   case X86::BI__builtin_ia32_gatherdiv16sf:
14572   case X86::BI__builtin_ia32_gathersiv8di:
14573   case X86::BI__builtin_ia32_gathersiv16si:
14574   case X86::BI__builtin_ia32_gatherdiv8di:
14575   case X86::BI__builtin_ia32_gatherdiv16si: {
14576     Intrinsic::ID IID;
14577     switch (BuiltinID) {
14578     default: llvm_unreachable("Unexpected builtin");
14579     case X86::BI__builtin_ia32_gather3div2df:
14580       IID = Intrinsic::x86_avx512_mask_gather3div2_df;
14581       break;
14582     case X86::BI__builtin_ia32_gather3div2di:
14583       IID = Intrinsic::x86_avx512_mask_gather3div2_di;
14584       break;
14585     case X86::BI__builtin_ia32_gather3div4df:
14586       IID = Intrinsic::x86_avx512_mask_gather3div4_df;
14587       break;
14588     case X86::BI__builtin_ia32_gather3div4di:
14589       IID = Intrinsic::x86_avx512_mask_gather3div4_di;
14590       break;
14591     case X86::BI__builtin_ia32_gather3div4sf:
14592       IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
14593       break;
14594     case X86::BI__builtin_ia32_gather3div4si:
14595       IID = Intrinsic::x86_avx512_mask_gather3div4_si;
14596       break;
14597     case X86::BI__builtin_ia32_gather3div8sf:
14598       IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
14599       break;
14600     case X86::BI__builtin_ia32_gather3div8si:
14601       IID = Intrinsic::x86_avx512_mask_gather3div8_si;
14602       break;
14603     case X86::BI__builtin_ia32_gather3siv2df:
14604       IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
14605       break;
14606     case X86::BI__builtin_ia32_gather3siv2di:
14607       IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
14608       break;
14609     case X86::BI__builtin_ia32_gather3siv4df:
14610       IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
14611       break;
14612     case X86::BI__builtin_ia32_gather3siv4di:
14613       IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
14614       break;
14615     case X86::BI__builtin_ia32_gather3siv4sf:
14616       IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
14617       break;
14618     case X86::BI__builtin_ia32_gather3siv4si:
14619       IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
14620       break;
14621     case X86::BI__builtin_ia32_gather3siv8sf:
14622       IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
14623       break;
14624     case X86::BI__builtin_ia32_gather3siv8si:
14625       IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
14626       break;
14627     case X86::BI__builtin_ia32_gathersiv8df:
14628       IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
14629       break;
14630     case X86::BI__builtin_ia32_gathersiv16sf:
14631       IID = Intrinsic::x86_avx512_mask_gather_dps_512;
14632       break;
14633     case X86::BI__builtin_ia32_gatherdiv8df:
14634       IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
14635       break;
14636     case X86::BI__builtin_ia32_gatherdiv16sf:
14637       IID = Intrinsic::x86_avx512_mask_gather_qps_512;
14638       break;
14639     case X86::BI__builtin_ia32_gathersiv8di:
14640       IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
14641       break;
14642     case X86::BI__builtin_ia32_gathersiv16si:
14643       IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
14644       break;
14645     case X86::BI__builtin_ia32_gatherdiv8di:
14646       IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
14647       break;
14648     case X86::BI__builtin_ia32_gatherdiv16si:
14649       IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
14650       break;
14651     }
14652 
14653     unsigned MinElts = std::min(
14654         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
14655         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
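    // The gather mask is a vXi1 vector whose width matches the narrower of
    // the passthru and index vectors.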
14656     Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
14657     Function *Intr = CGM.getIntrinsic(IID);
14658     return Builder.CreateCall(Intr, Ops);
14659   }
14660 
14661   case X86::BI__builtin_ia32_scattersiv8df:
14662   case X86::BI__builtin_ia32_scattersiv16sf:
14663   case X86::BI__builtin_ia32_scatterdiv8df:
14664   case X86::BI__builtin_ia32_scatterdiv16sf:
14665   case X86::BI__builtin_ia32_scattersiv8di:
14666   case X86::BI__builtin_ia32_scattersiv16si:
14667   case X86::BI__builtin_ia32_scatterdiv8di:
14668   case X86::BI__builtin_ia32_scatterdiv16si:
14669   case X86::BI__builtin_ia32_scatterdiv2df:
14670   case X86::BI__builtin_ia32_scatterdiv2di:
14671   case X86::BI__builtin_ia32_scatterdiv4df:
14672   case X86::BI__builtin_ia32_scatterdiv4di:
14673   case X86::BI__builtin_ia32_scatterdiv4sf:
14674   case X86::BI__builtin_ia32_scatterdiv4si:
14675   case X86::BI__builtin_ia32_scatterdiv8sf:
14676   case X86::BI__builtin_ia32_scatterdiv8si:
14677   case X86::BI__builtin_ia32_scattersiv2df:
14678   case X86::BI__builtin_ia32_scattersiv2di:
14679   case X86::BI__builtin_ia32_scattersiv4df:
14680   case X86::BI__builtin_ia32_scattersiv4di:
14681   case X86::BI__builtin_ia32_scattersiv4sf:
14682   case X86::BI__builtin_ia32_scattersiv4si:
14683   case X86::BI__builtin_ia32_scattersiv8sf:
14684   case X86::BI__builtin_ia32_scattersiv8si: {
14685     Intrinsic::ID IID;
14686     switch (BuiltinID) {
14687     default: llvm_unreachable("Unexpected builtin");
14688     case X86::BI__builtin_ia32_scattersiv8df:
14689       IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
14690       break;
14691     case X86::BI__builtin_ia32_scattersiv16sf:
14692       IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
14693       break;
14694     case X86::BI__builtin_ia32_scatterdiv8df:
14695       IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
14696       break;
14697     case X86::BI__builtin_ia32_scatterdiv16sf:
14698       IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
14699       break;
14700     case X86::BI__builtin_ia32_scattersiv8di:
14701       IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
14702       break;
14703     case X86::BI__builtin_ia32_scattersiv16si:
14704       IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
14705       break;
14706     case X86::BI__builtin_ia32_scatterdiv8di:
14707       IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
14708       break;
14709     case X86::BI__builtin_ia32_scatterdiv16si:
14710       IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
14711       break;
14712     case X86::BI__builtin_ia32_scatterdiv2df:
14713       IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
14714       break;
14715     case X86::BI__builtin_ia32_scatterdiv2di:
14716       IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
14717       break;
14718     case X86::BI__builtin_ia32_scatterdiv4df:
14719       IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
14720       break;
14721     case X86::BI__builtin_ia32_scatterdiv4di:
14722       IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
14723       break;
14724     case X86::BI__builtin_ia32_scatterdiv4sf:
14725       IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
14726       break;
14727     case X86::BI__builtin_ia32_scatterdiv4si:
14728       IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
14729       break;
14730     case X86::BI__builtin_ia32_scatterdiv8sf:
14731       IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
14732       break;
14733     case X86::BI__builtin_ia32_scatterdiv8si:
14734       IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
14735       break;
14736     case X86::BI__builtin_ia32_scattersiv2df:
14737       IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
14738       break;
14739     case X86::BI__builtin_ia32_scattersiv2di:
14740       IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
14741       break;
14742     case X86::BI__builtin_ia32_scattersiv4df:
14743       IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
14744       break;
14745     case X86::BI__builtin_ia32_scattersiv4di:
14746       IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
14747       break;
14748     case X86::BI__builtin_ia32_scattersiv4sf:
14749       IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
14750       break;
14751     case X86::BI__builtin_ia32_scattersiv4si:
14752       IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
14753       break;
14754     case X86::BI__builtin_ia32_scattersiv8sf:
14755       IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
14756       break;
14757     case X86::BI__builtin_ia32_scattersiv8si:
14758       IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
14759       break;
14760     }
14761 
14762     unsigned MinElts = std::min(
14763         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
14764         cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
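    // The scatter mask width matches the narrower of the index and data
    // vectors.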
14765     Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
14766     Function *Intr = CGM.getIntrinsic(IID);
14767     return Builder.CreateCall(Intr, Ops);
14768   }
14769 
14770   case X86::BI__builtin_ia32_vextractf128_pd256:
14771   case X86::BI__builtin_ia32_vextractf128_ps256:
14772   case X86::BI__builtin_ia32_vextractf128_si256:
14773   case X86::BI__builtin_ia32_extract128i256:
14774   case X86::BI__builtin_ia32_extractf64x4_mask:
14775   case X86::BI__builtin_ia32_extractf32x4_mask:
14776   case X86::BI__builtin_ia32_extracti64x4_mask:
14777   case X86::BI__builtin_ia32_extracti32x4_mask:
14778   case X86::BI__builtin_ia32_extractf32x8_mask:
14779   case X86::BI__builtin_ia32_extracti32x8_mask:
14780   case X86::BI__builtin_ia32_extractf32x4_256_mask:
14781   case X86::BI__builtin_ia32_extracti32x4_256_mask:
14782   case X86::BI__builtin_ia32_extractf64x2_256_mask:
14783   case X86::BI__builtin_ia32_extracti64x2_256_mask:
14784   case X86::BI__builtin_ia32_extractf64x2_512_mask:
14785   case X86::BI__builtin_ia32_extracti64x2_512_mask: {
14786     auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
14787     unsigned NumElts = DstTy->getNumElements();
14788     unsigned SrcNumElts =
14789         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14790     unsigned SubVectors = SrcNumElts / NumElts;
14791     unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
14792     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14793     Index &= SubVectors - 1; // Remove any extra bits.
14794     Index *= NumElts;
14795 
14796     int Indices[16];
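    // e.g. extracting subvector 1 from a v8f32 source into a v4f32 result
    // gives indices <4, 5, 6, 7>.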
14797     for (unsigned i = 0; i != NumElts; ++i)
14798       Indices[i] = i + Index;
14799 
14800     Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14801                                              "extract");
14802 
14803     if (Ops.size() == 4)
14804       Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
14805 
14806     return Res;
14807   }
14808   case X86::BI__builtin_ia32_vinsertf128_pd256:
14809   case X86::BI__builtin_ia32_vinsertf128_ps256:
14810   case X86::BI__builtin_ia32_vinsertf128_si256:
14811   case X86::BI__builtin_ia32_insert128i256:
14812   case X86::BI__builtin_ia32_insertf64x4:
14813   case X86::BI__builtin_ia32_insertf32x4:
14814   case X86::BI__builtin_ia32_inserti64x4:
14815   case X86::BI__builtin_ia32_inserti32x4:
14816   case X86::BI__builtin_ia32_insertf32x8:
14817   case X86::BI__builtin_ia32_inserti32x8:
14818   case X86::BI__builtin_ia32_insertf32x4_256:
14819   case X86::BI__builtin_ia32_inserti32x4_256:
14820   case X86::BI__builtin_ia32_insertf64x2_256:
14821   case X86::BI__builtin_ia32_inserti64x2_256:
14822   case X86::BI__builtin_ia32_insertf64x2_512:
14823   case X86::BI__builtin_ia32_inserti64x2_512: {
14824     unsigned DstNumElts =
14825         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14826     unsigned SrcNumElts =
14827         cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
14828     unsigned SubVectors = DstNumElts / SrcNumElts;
14829     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
14830     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14831     Index &= SubVectors - 1; // Remove any extra bits.
14832     Index *= SrcNumElts;
14833 
14834     int Indices[16];
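    // First widen the smaller source vector to the destination width (the
    // extra lanes are never selected), then shuffle it into Ops[0] at the
    // chosen subvector position.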
14835     for (unsigned i = 0; i != DstNumElts; ++i)
14836       Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
14837 
14838     Value *Op1 = Builder.CreateShuffleVector(
14839         Ops[1], ArrayRef(Indices, DstNumElts), "widen");
14840 
14841     for (unsigned i = 0; i != DstNumElts; ++i) {
14842       if (i >= Index && i < (Index + SrcNumElts))
14843         Indices[i] = (i - Index) + DstNumElts;
14844       else
14845         Indices[i] = i;
14846     }
14847 
14848     return Builder.CreateShuffleVector(Ops[0], Op1,
14849                                        ArrayRef(Indices, DstNumElts), "insert");
14850   }
14851   case X86::BI__builtin_ia32_pmovqd512_mask:
14852   case X86::BI__builtin_ia32_pmovwb512_mask: {
14853     Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14854     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
14855   }
14856   case X86::BI__builtin_ia32_pmovdb512_mask:
14857   case X86::BI__builtin_ia32_pmovdw512_mask:
14858   case X86::BI__builtin_ia32_pmovqw512_mask: {
14859     if (const auto *C = dyn_cast<Constant>(Ops[2]))
14860       if (C->isAllOnesValue())
14861         return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14862 
14863     Intrinsic::ID IID;
14864     switch (BuiltinID) {
14865     default: llvm_unreachable("Unsupported intrinsic!");
14866     case X86::BI__builtin_ia32_pmovdb512_mask:
14867       IID = Intrinsic::x86_avx512_mask_pmov_db_512;
14868       break;
14869     case X86::BI__builtin_ia32_pmovdw512_mask:
14870       IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
14871       break;
14872     case X86::BI__builtin_ia32_pmovqw512_mask:
14873       IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
14874       break;
14875     }
14876 
14877     Function *Intr = CGM.getIntrinsic(IID);
14878     return Builder.CreateCall(Intr, Ops);
14879   }
14880   case X86::BI__builtin_ia32_pblendw128:
14881   case X86::BI__builtin_ia32_blendpd:
14882   case X86::BI__builtin_ia32_blendps:
14883   case X86::BI__builtin_ia32_blendpd256:
14884   case X86::BI__builtin_ia32_blendps256:
14885   case X86::BI__builtin_ia32_pblendw256:
14886   case X86::BI__builtin_ia32_pblendd128:
14887   case X86::BI__builtin_ia32_pblendd256: {
14888     unsigned NumElts =
14889         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14890     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14891 
14892     int Indices[16];
    // If there are more than 8 elements, the immediate is used twice, so make
    // sure we handle that.
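    // A set bit in the immediate selects the element from Ops[1]; e.g. for a
    // 4-element blend with Imm = 0b0101 the shuffle mask is <4, 1, 6, 3>.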
14895     for (unsigned i = 0; i != NumElts; ++i)
14896       Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
14897 
14898     return Builder.CreateShuffleVector(Ops[0], Ops[1],
14899                                        ArrayRef(Indices, NumElts), "blend");
14900   }
14901   case X86::BI__builtin_ia32_pshuflw:
14902   case X86::BI__builtin_ia32_pshuflw256:
14903   case X86::BI__builtin_ia32_pshuflw512: {
14904     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14905     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14906     unsigned NumElts = Ty->getNumElements();
14907 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14909     Imm = (Imm & 0xff) * 0x01010101;
14910 
14911     int Indices[32];
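    // e.g. an immediate of 0x1B reverses the low four words of each 128-bit
    // lane, giving indices <3, 2, 1, 0, 4, 5, 6, 7, ...>.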
14912     for (unsigned l = 0; l != NumElts; l += 8) {
14913       for (unsigned i = 0; i != 4; ++i) {
14914         Indices[l + i] = l + (Imm & 3);
14915         Imm >>= 2;
14916       }
14917       for (unsigned i = 4; i != 8; ++i)
14918         Indices[l + i] = l + i;
14919     }
14920 
14921     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14922                                        "pshuflw");
14923   }
14924   case X86::BI__builtin_ia32_pshufhw:
14925   case X86::BI__builtin_ia32_pshufhw256:
14926   case X86::BI__builtin_ia32_pshufhw512: {
14927     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14928     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14929     unsigned NumElts = Ty->getNumElements();
14930 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14932     Imm = (Imm & 0xff) * 0x01010101;
14933 
14934     int Indices[32];
14935     for (unsigned l = 0; l != NumElts; l += 8) {
14936       for (unsigned i = 0; i != 4; ++i)
14937         Indices[l + i] = l + i;
14938       for (unsigned i = 4; i != 8; ++i) {
14939         Indices[l + i] = l + 4 + (Imm & 3);
14940         Imm >>= 2;
14941       }
14942     }
14943 
14944     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14945                                        "pshufhw");
14946   }
14947   case X86::BI__builtin_ia32_pshufd:
14948   case X86::BI__builtin_ia32_pshufd256:
14949   case X86::BI__builtin_ia32_pshufd512:
14950   case X86::BI__builtin_ia32_vpermilpd:
14951   case X86::BI__builtin_ia32_vpermilps:
14952   case X86::BI__builtin_ia32_vpermilpd256:
14953   case X86::BI__builtin_ia32_vpermilps256:
14954   case X86::BI__builtin_ia32_vpermilpd512:
14955   case X86::BI__builtin_ia32_vpermilps512: {
14956     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14957     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14958     unsigned NumElts = Ty->getNumElements();
14959     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14960     unsigned NumLaneElts = NumElts / NumLanes;
14961 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14963     Imm = (Imm & 0xff) * 0x01010101;
14964 
14965     int Indices[16];
14966     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14967       for (unsigned i = 0; i != NumLaneElts; ++i) {
14968         Indices[i + l] = (Imm % NumLaneElts) + l;
14969         Imm /= NumLaneElts;
14970       }
14971     }
14972 
14973     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14974                                        "permil");
14975   }
14976   case X86::BI__builtin_ia32_shufpd:
14977   case X86::BI__builtin_ia32_shufpd256:
14978   case X86::BI__builtin_ia32_shufpd512:
14979   case X86::BI__builtin_ia32_shufps:
14980   case X86::BI__builtin_ia32_shufps256:
14981   case X86::BI__builtin_ia32_shufps512: {
14982     uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14983     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14984     unsigned NumElts = Ty->getNumElements();
14985     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14986     unsigned NumLaneElts = NumElts / NumLanes;
14987 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14989     Imm = (Imm & 0xff) * 0x01010101;
14990 
14991     int Indices[16];
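    // Within each 128-bit lane the low half of the result comes from Ops[0]
    // and the high half from Ops[1]; the immediate gives the element indices.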
14992     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14993       for (unsigned i = 0; i != NumLaneElts; ++i) {
14994         unsigned Index = Imm % NumLaneElts;
14995         Imm /= NumLaneElts;
14996         if (i >= (NumLaneElts / 2))
14997           Index += NumElts;
14998         Indices[l + i] = l + Index;
14999       }
15000     }
15001 
15002     return Builder.CreateShuffleVector(Ops[0], Ops[1],
15003                                        ArrayRef(Indices, NumElts), "shufp");
15004   }
15005   case X86::BI__builtin_ia32_permdi256:
15006   case X86::BI__builtin_ia32_permdf256:
15007   case X86::BI__builtin_ia32_permdi512:
15008   case X86::BI__builtin_ia32_permdf512: {
15009     unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
15010     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
15011     unsigned NumElts = Ty->getNumElements();
15012 
15013     // These intrinsics operate on 256-bit lanes of four 64-bit elements.
15014     int Indices[8];
15015     for (unsigned l = 0; l != NumElts; l += 4)
15016       for (unsigned i = 0; i != 4; ++i)
15017         Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
15018 
15019     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
15020                                        "perm");
15021   }
15022   case X86::BI__builtin_ia32_palignr128:
15023   case X86::BI__builtin_ia32_palignr256:
15024   case X86::BI__builtin_ia32_palignr512: {
15025     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
15026 
15027     unsigned NumElts =
15028         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15029     assert(NumElts % 16 == 0);
15030 
15031     // If palignr is shifting the pair of vectors more than the size of two
15032     // lanes, emit zero.
15033     if (ShiftVal >= 32)
15034       return llvm::Constant::getNullValue(ConvertType(E->getType()));
15035 
15036     // If palignr is shifting the pair of input vectors more than one lane,
15037     // but less than two lanes, convert to shifting in zeroes.
15038     if (ShiftVal > 16) {
15039       ShiftVal -= 16;
15040       Ops[1] = Ops[0];
15041       Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
15042     }
15043 
15044     int Indices[64];
    // 256/512-bit palignr operates on 128-bit lanes, so we need to handle that.
15046     for (unsigned l = 0; l != NumElts; l += 16) {
15047       for (unsigned i = 0; i != 16; ++i) {
15048         unsigned Idx = ShiftVal + i;
15049         if (Idx >= 16)
15050           Idx += NumElts - 16; // End of lane, switch operand.
15051         Indices[l + i] = Idx + l;
15052       }
15053     }
15054 
15055     return Builder.CreateShuffleVector(Ops[1], Ops[0],
15056                                        ArrayRef(Indices, NumElts), "palignr");
15057   }
15058   case X86::BI__builtin_ia32_alignd128:
15059   case X86::BI__builtin_ia32_alignd256:
15060   case X86::BI__builtin_ia32_alignd512:
15061   case X86::BI__builtin_ia32_alignq128:
15062   case X86::BI__builtin_ia32_alignq256:
15063   case X86::BI__builtin_ia32_alignq512: {
15064     unsigned NumElts =
15065         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15066     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
15067 
15068     // Mask the shift amount to width of a vector.
15069     ShiftVal &= NumElts - 1;
15070 
15071     int Indices[16];
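    // valign concatenates Ops[1] (low) and Ops[0] (high) and extracts
    // NumElts consecutive elements starting at the shift amount.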
15072     for (unsigned i = 0; i != NumElts; ++i)
15073       Indices[i] = i + ShiftVal;
15074 
15075     return Builder.CreateShuffleVector(Ops[1], Ops[0],
15076                                        ArrayRef(Indices, NumElts), "valign");
15077   }
15078   case X86::BI__builtin_ia32_shuf_f32x4_256:
15079   case X86::BI__builtin_ia32_shuf_f64x2_256:
15080   case X86::BI__builtin_ia32_shuf_i32x4_256:
15081   case X86::BI__builtin_ia32_shuf_i64x2_256:
15082   case X86::BI__builtin_ia32_shuf_f32x4:
15083   case X86::BI__builtin_ia32_shuf_f64x2:
15084   case X86::BI__builtin_ia32_shuf_i32x4:
15085   case X86::BI__builtin_ia32_shuf_i64x2: {
15086     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
15087     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
15088     unsigned NumElts = Ty->getNumElements();
15089     unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
15090     unsigned NumLaneElts = NumElts / NumLanes;
15091 
15092     int Indices[16];
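    // Each result lane copies a whole source lane: lanes in the low half of
    // the result come from Ops[0] and lanes in the high half from Ops[1],
    // with the immediate selecting the source lane.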
15093     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
15094       unsigned Index = (Imm % NumLanes) * NumLaneElts;
15095       Imm /= NumLanes; // Discard the bits we just used.
15096       if (l >= (NumElts / 2))
15097         Index += NumElts; // Switch to other source.
15098       for (unsigned i = 0; i != NumLaneElts; ++i) {
15099         Indices[l + i] = Index + i;
15100       }
15101     }
15102 
15103     return Builder.CreateShuffleVector(Ops[0], Ops[1],
15104                                        ArrayRef(Indices, NumElts), "shuf");
15105   }
15106 
15107   case X86::BI__builtin_ia32_vperm2f128_pd256:
15108   case X86::BI__builtin_ia32_vperm2f128_ps256:
15109   case X86::BI__builtin_ia32_vperm2f128_si256:
15110   case X86::BI__builtin_ia32_permti256: {
15111     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
15112     unsigned NumElts =
15113         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15114 
    // This takes a very simple approach since there are two lanes and a
    // shuffle can have two inputs. So we reserve the first input for the
    // first lane and the second input for the second lane. This may result
    // in duplicate sources, but the backend can deal with that.
15119 
15120     Value *OutOps[2];
15121     int Indices[8];
15122     for (unsigned l = 0; l != 2; ++l) {
15123       // Determine the source for this lane.
15124       if (Imm & (1 << ((l * 4) + 3)))
15125         OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
15126       else if (Imm & (1 << ((l * 4) + 1)))
15127         OutOps[l] = Ops[1];
15128       else
15129         OutOps[l] = Ops[0];
15130 
15131       for (unsigned i = 0; i != NumElts/2; ++i) {
        // Start with the ith element of the source for this lane.
15133         unsigned Idx = (l * NumElts) + i;
15134         // If bit 0 of the immediate half is set, switch to the high half of
15135         // the source.
15136         if (Imm & (1 << (l * 4)))
15137           Idx += NumElts/2;
15138         Indices[(l * (NumElts/2)) + i] = Idx;
15139       }
15140     }
15141 
15142     return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
15143                                        ArrayRef(Indices, NumElts), "vperm");
15144   }
15145 
15146   case X86::BI__builtin_ia32_pslldqi128_byteshift:
15147   case X86::BI__builtin_ia32_pslldqi256_byteshift:
15148   case X86::BI__builtin_ia32_pslldqi512_byteshift: {
15149     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15150     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
15151     // Builtin type is vXi64 so multiply by 8 to get bytes.
15152     unsigned NumElts = ResultType->getNumElements() * 8;
15153 
15154     // If pslldq is shifting the vector more than 15 bytes, emit zero.
15155     if (ShiftVal >= 16)
15156       return llvm::Constant::getNullValue(ResultType);
15157 
15158     int Indices[64];
15159     // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
15160     for (unsigned l = 0; l != NumElts; l += 16) {
15161       for (unsigned i = 0; i != 16; ++i) {
15162         unsigned Idx = NumElts + i - ShiftVal;
15163         if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
15164         Indices[l + i] = Idx + l;
15165       }
15166     }
15167 
15168     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
15169     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
15170     Value *Zero = llvm::Constant::getNullValue(VecTy);
15171     Value *SV = Builder.CreateShuffleVector(
15172         Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
15173     return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
15174   }
15175   case X86::BI__builtin_ia32_psrldqi128_byteshift:
15176   case X86::BI__builtin_ia32_psrldqi256_byteshift:
15177   case X86::BI__builtin_ia32_psrldqi512_byteshift: {
15178     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15179     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
15180     // Builtin type is vXi64 so multiply by 8 to get bytes.
15181     unsigned NumElts = ResultType->getNumElements() * 8;
15182 
15183     // If psrldq is shifting the vector more than 15 bytes, emit zero.
15184     if (ShiftVal >= 16)
15185       return llvm::Constant::getNullValue(ResultType);
15186 
15187     int Indices[64];
15188     // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
15189     for (unsigned l = 0; l != NumElts; l += 16) {
15190       for (unsigned i = 0; i != 16; ++i) {
15191         unsigned Idx = i + ShiftVal;
15192         if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
15193         Indices[l + i] = Idx + l;
15194       }
15195     }
15196 
15197     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
15198     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
15199     Value *Zero = llvm::Constant::getNullValue(VecTy);
15200     Value *SV = Builder.CreateShuffleVector(
15201         Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
15202     return Builder.CreateBitCast(SV, ResultType, "cast");
15203   }
15204   case X86::BI__builtin_ia32_kshiftliqi:
15205   case X86::BI__builtin_ia32_kshiftlihi:
15206   case X86::BI__builtin_ia32_kshiftlisi:
15207   case X86::BI__builtin_ia32_kshiftlidi: {
15208     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15209     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15210 
15211     if (ShiftVal >= NumElts)
15212       return llvm::Constant::getNullValue(Ops[0]->getType());
15213 
15214     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
15215 
15216     int Indices[64];
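    // Element i of the result is element (i - ShiftVal) of the input mask,
    // with zeros shifted in at the low end.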
15217     for (unsigned i = 0; i != NumElts; ++i)
15218       Indices[i] = NumElts + i - ShiftVal;
15219 
15220     Value *Zero = llvm::Constant::getNullValue(In->getType());
15221     Value *SV = Builder.CreateShuffleVector(
15222         Zero, In, ArrayRef(Indices, NumElts), "kshiftl");
15223     return Builder.CreateBitCast(SV, Ops[0]->getType());
15224   }
15225   case X86::BI__builtin_ia32_kshiftriqi:
15226   case X86::BI__builtin_ia32_kshiftrihi:
15227   case X86::BI__builtin_ia32_kshiftrisi:
15228   case X86::BI__builtin_ia32_kshiftridi: {
15229     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15230     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15231 
15232     if (ShiftVal >= NumElts)
15233       return llvm::Constant::getNullValue(Ops[0]->getType());
15234 
15235     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
15236 
15237     int Indices[64];
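    // Element i of the result is element (i + ShiftVal) of the input mask,
    // with zeros shifted in at the high end.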
15238     for (unsigned i = 0; i != NumElts; ++i)
15239       Indices[i] = i + ShiftVal;
15240 
15241     Value *Zero = llvm::Constant::getNullValue(In->getType());
15242     Value *SV = Builder.CreateShuffleVector(
15243         In, Zero, ArrayRef(Indices, NumElts), "kshiftr");
15244     return Builder.CreateBitCast(SV, Ops[0]->getType());
15245   }
15246   case X86::BI__builtin_ia32_movnti:
15247   case X86::BI__builtin_ia32_movnti64:
15248   case X86::BI__builtin_ia32_movntsd:
15249   case X86::BI__builtin_ia32_movntss: {
15250     llvm::MDNode *Node = llvm::MDNode::get(
15251         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
15252 
15253     Value *Ptr = Ops[0];
15254     Value *Src = Ops[1];
15255 
    // Extract the 0th element of the source vector.
15257     if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
15258         BuiltinID == X86::BI__builtin_ia32_movntss)
15259       Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");
15260 
15261     // Unaligned nontemporal store of the scalar value.
15262     StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr);
15263     SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node);
15264     SI->setAlignment(llvm::Align(1));
15265     return SI;
15266   }
  // Rotate is a special case of funnel shift where the first two args are
  // the same.
15268   case X86::BI__builtin_ia32_vprotb:
15269   case X86::BI__builtin_ia32_vprotw:
15270   case X86::BI__builtin_ia32_vprotd:
15271   case X86::BI__builtin_ia32_vprotq:
15272   case X86::BI__builtin_ia32_vprotbi:
15273   case X86::BI__builtin_ia32_vprotwi:
15274   case X86::BI__builtin_ia32_vprotdi:
15275   case X86::BI__builtin_ia32_vprotqi:
15276   case X86::BI__builtin_ia32_prold128:
15277   case X86::BI__builtin_ia32_prold256:
15278   case X86::BI__builtin_ia32_prold512:
15279   case X86::BI__builtin_ia32_prolq128:
15280   case X86::BI__builtin_ia32_prolq256:
15281   case X86::BI__builtin_ia32_prolq512:
15282   case X86::BI__builtin_ia32_prolvd128:
15283   case X86::BI__builtin_ia32_prolvd256:
15284   case X86::BI__builtin_ia32_prolvd512:
15285   case X86::BI__builtin_ia32_prolvq128:
15286   case X86::BI__builtin_ia32_prolvq256:
15287   case X86::BI__builtin_ia32_prolvq512:
15288     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
15289   case X86::BI__builtin_ia32_prord128:
15290   case X86::BI__builtin_ia32_prord256:
15291   case X86::BI__builtin_ia32_prord512:
15292   case X86::BI__builtin_ia32_prorq128:
15293   case X86::BI__builtin_ia32_prorq256:
15294   case X86::BI__builtin_ia32_prorq512:
15295   case X86::BI__builtin_ia32_prorvd128:
15296   case X86::BI__builtin_ia32_prorvd256:
15297   case X86::BI__builtin_ia32_prorvd512:
15298   case X86::BI__builtin_ia32_prorvq128:
15299   case X86::BI__builtin_ia32_prorvq256:
15300   case X86::BI__builtin_ia32_prorvq512:
15301     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
15302   case X86::BI__builtin_ia32_selectb_128:
15303   case X86::BI__builtin_ia32_selectb_256:
15304   case X86::BI__builtin_ia32_selectb_512:
15305   case X86::BI__builtin_ia32_selectw_128:
15306   case X86::BI__builtin_ia32_selectw_256:
15307   case X86::BI__builtin_ia32_selectw_512:
15308   case X86::BI__builtin_ia32_selectd_128:
15309   case X86::BI__builtin_ia32_selectd_256:
15310   case X86::BI__builtin_ia32_selectd_512:
15311   case X86::BI__builtin_ia32_selectq_128:
15312   case X86::BI__builtin_ia32_selectq_256:
15313   case X86::BI__builtin_ia32_selectq_512:
15314   case X86::BI__builtin_ia32_selectph_128:
15315   case X86::BI__builtin_ia32_selectph_256:
15316   case X86::BI__builtin_ia32_selectph_512:
15317   case X86::BI__builtin_ia32_selectpbf_128:
15318   case X86::BI__builtin_ia32_selectpbf_256:
15319   case X86::BI__builtin_ia32_selectpbf_512:
15320   case X86::BI__builtin_ia32_selectps_128:
15321   case X86::BI__builtin_ia32_selectps_256:
15322   case X86::BI__builtin_ia32_selectps_512:
15323   case X86::BI__builtin_ia32_selectpd_128:
15324   case X86::BI__builtin_ia32_selectpd_256:
15325   case X86::BI__builtin_ia32_selectpd_512:
15326     return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
15327   case X86::BI__builtin_ia32_selectsh_128:
15328   case X86::BI__builtin_ia32_selectsbf_128:
15329   case X86::BI__builtin_ia32_selectss_128:
15330   case X86::BI__builtin_ia32_selectsd_128: {
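    // Only element 0 participates: select between the low elements of Ops[1]
    // and Ops[2] under bit 0 of the mask, then reinsert into Ops[1].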
15331     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
15332     Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
15333     A = EmitX86ScalarSelect(*this, Ops[0], A, B);
15334     return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
15335   }
15336   case X86::BI__builtin_ia32_cmpb128_mask:
15337   case X86::BI__builtin_ia32_cmpb256_mask:
15338   case X86::BI__builtin_ia32_cmpb512_mask:
15339   case X86::BI__builtin_ia32_cmpw128_mask:
15340   case X86::BI__builtin_ia32_cmpw256_mask:
15341   case X86::BI__builtin_ia32_cmpw512_mask:
15342   case X86::BI__builtin_ia32_cmpd128_mask:
15343   case X86::BI__builtin_ia32_cmpd256_mask:
15344   case X86::BI__builtin_ia32_cmpd512_mask:
15345   case X86::BI__builtin_ia32_cmpq128_mask:
15346   case X86::BI__builtin_ia32_cmpq256_mask:
15347   case X86::BI__builtin_ia32_cmpq512_mask: {
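    // Bits [2:0] of the immediate select the comparison predicate; these are
    // the signed integer compares.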
15348     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
15349     return EmitX86MaskedCompare(*this, CC, true, Ops);
15350   }
15351   case X86::BI__builtin_ia32_ucmpb128_mask:
15352   case X86::BI__builtin_ia32_ucmpb256_mask:
15353   case X86::BI__builtin_ia32_ucmpb512_mask:
15354   case X86::BI__builtin_ia32_ucmpw128_mask:
15355   case X86::BI__builtin_ia32_ucmpw256_mask:
15356   case X86::BI__builtin_ia32_ucmpw512_mask:
15357   case X86::BI__builtin_ia32_ucmpd128_mask:
15358   case X86::BI__builtin_ia32_ucmpd256_mask:
15359   case X86::BI__builtin_ia32_ucmpd512_mask:
15360   case X86::BI__builtin_ia32_ucmpq128_mask:
15361   case X86::BI__builtin_ia32_ucmpq256_mask:
15362   case X86::BI__builtin_ia32_ucmpq512_mask: {
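    // Same as above, but using unsigned comparisons.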
15363     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
15364     return EmitX86MaskedCompare(*this, CC, false, Ops);
15365   }
15366   case X86::BI__builtin_ia32_vpcomb:
15367   case X86::BI__builtin_ia32_vpcomw:
15368   case X86::BI__builtin_ia32_vpcomd:
15369   case X86::BI__builtin_ia32_vpcomq:
15370     return EmitX86vpcom(*this, Ops, true);
15371   case X86::BI__builtin_ia32_vpcomub:
15372   case X86::BI__builtin_ia32_vpcomuw:
15373   case X86::BI__builtin_ia32_vpcomud:
15374   case X86::BI__builtin_ia32_vpcomuq:
15375     return EmitX86vpcom(*this, Ops, false);
15376 
15377   case X86::BI__builtin_ia32_kortestcqi:
15378   case X86::BI__builtin_ia32_kortestchi:
15379   case X86::BI__builtin_ia32_kortestcsi:
15380   case X86::BI__builtin_ia32_kortestcdi: {
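    // kortestc: OR the two masks and return 1 if the result is all ones (the
    // carry flag of KORTEST).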
15381     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
15382     Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
15383     Value *Cmp = Builder.CreateICmpEQ(Or, C);
15384     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
15385   }
15386   case X86::BI__builtin_ia32_kortestzqi:
15387   case X86::BI__builtin_ia32_kortestzhi:
15388   case X86::BI__builtin_ia32_kortestzsi:
15389   case X86::BI__builtin_ia32_kortestzdi: {
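    // kortestz: OR the two masks and return 1 if the result is zero (the
    // zero flag of KORTEST).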
15390     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
15391     Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
15392     Value *Cmp = Builder.CreateICmpEQ(Or, C);
15393     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
15394   }
15395 
15396   case X86::BI__builtin_ia32_ktestcqi:
15397   case X86::BI__builtin_ia32_ktestzqi:
15398   case X86::BI__builtin_ia32_ktestchi:
15399   case X86::BI__builtin_ia32_ktestzhi:
15400   case X86::BI__builtin_ia32_ktestcsi:
15401   case X86::BI__builtin_ia32_ktestzsi:
15402   case X86::BI__builtin_ia32_ktestcdi:
15403   case X86::BI__builtin_ia32_ktestzdi: {
15404     Intrinsic::ID IID;
15405     switch (BuiltinID) {
15406     default: llvm_unreachable("Unsupported intrinsic!");
15407     case X86::BI__builtin_ia32_ktestcqi:
15408       IID = Intrinsic::x86_avx512_ktestc_b;
15409       break;
15410     case X86::BI__builtin_ia32_ktestzqi:
15411       IID = Intrinsic::x86_avx512_ktestz_b;
15412       break;
15413     case X86::BI__builtin_ia32_ktestchi:
15414       IID = Intrinsic::x86_avx512_ktestc_w;
15415       break;
15416     case X86::BI__builtin_ia32_ktestzhi:
15417       IID = Intrinsic::x86_avx512_ktestz_w;
15418       break;
15419     case X86::BI__builtin_ia32_ktestcsi:
15420       IID = Intrinsic::x86_avx512_ktestc_d;
15421       break;
15422     case X86::BI__builtin_ia32_ktestzsi:
15423       IID = Intrinsic::x86_avx512_ktestz_d;
15424       break;
15425     case X86::BI__builtin_ia32_ktestcdi:
15426       IID = Intrinsic::x86_avx512_ktestc_q;
15427       break;
15428     case X86::BI__builtin_ia32_ktestzdi:
15429       IID = Intrinsic::x86_avx512_ktestz_q;
15430       break;
15431     }
15432 
15433     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15434     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15435     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15436     Function *Intr = CGM.getIntrinsic(IID);
15437     return Builder.CreateCall(Intr, {LHS, RHS});
15438   }
15439 
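  // kadd: call the AVX512 kadd intrinsic on the vXi1 mask operands and bitcast
  // the result back to the integer mask type.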
15440   case X86::BI__builtin_ia32_kaddqi:
15441   case X86::BI__builtin_ia32_kaddhi:
15442   case X86::BI__builtin_ia32_kaddsi:
15443   case X86::BI__builtin_ia32_kadddi: {
15444     Intrinsic::ID IID;
15445     switch (BuiltinID) {
15446     default: llvm_unreachable("Unsupported intrinsic!");
15447     case X86::BI__builtin_ia32_kaddqi:
15448       IID = Intrinsic::x86_avx512_kadd_b;
15449       break;
15450     case X86::BI__builtin_ia32_kaddhi:
15451       IID = Intrinsic::x86_avx512_kadd_w;
15452       break;
15453     case X86::BI__builtin_ia32_kaddsi:
15454       IID = Intrinsic::x86_avx512_kadd_d;
15455       break;
15456     case X86::BI__builtin_ia32_kadddi:
15457       IID = Intrinsic::x86_avx512_kadd_q;
15458       break;
15459     }
15460 
15461     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15462     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15463     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15464     Function *Intr = CGM.getIntrinsic(IID);
15465     Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
15466     return Builder.CreateBitCast(Res, Ops[0]->getType());
15467   }
15468   case X86::BI__builtin_ia32_kandqi:
15469   case X86::BI__builtin_ia32_kandhi:
15470   case X86::BI__builtin_ia32_kandsi:
15471   case X86::BI__builtin_ia32_kanddi:
15472     return EmitX86MaskLogic(*this, Instruction::And, Ops);
15473   case X86::BI__builtin_ia32_kandnqi:
15474   case X86::BI__builtin_ia32_kandnhi:
15475   case X86::BI__builtin_ia32_kandnsi:
15476   case X86::BI__builtin_ia32_kandndi:
15477     return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
15478   case X86::BI__builtin_ia32_korqi:
15479   case X86::BI__builtin_ia32_korhi:
15480   case X86::BI__builtin_ia32_korsi:
15481   case X86::BI__builtin_ia32_kordi:
15482     return EmitX86MaskLogic(*this, Instruction::Or, Ops);
15483   case X86::BI__builtin_ia32_kxnorqi:
15484   case X86::BI__builtin_ia32_kxnorhi:
15485   case X86::BI__builtin_ia32_kxnorsi:
15486   case X86::BI__builtin_ia32_kxnordi:
15487     return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
15488   case X86::BI__builtin_ia32_kxorqi:
15489   case X86::BI__builtin_ia32_kxorhi:
15490   case X86::BI__builtin_ia32_kxorsi:
15491   case X86::BI__builtin_ia32_kxordi:
    return EmitX86MaskLogic(*this, Instruction::Xor, Ops);
15493   case X86::BI__builtin_ia32_knotqi:
15494   case X86::BI__builtin_ia32_knothi:
15495   case X86::BI__builtin_ia32_knotsi:
15496   case X86::BI__builtin_ia32_knotdi: {
15497     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15498     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
15499     return Builder.CreateBitCast(Builder.CreateNot(Res),
15500                                  Ops[0]->getType());
15501   }
15502   case X86::BI__builtin_ia32_kmovb:
15503   case X86::BI__builtin_ia32_kmovw:
15504   case X86::BI__builtin_ia32_kmovd:
15505   case X86::BI__builtin_ia32_kmovq: {
15506     // Bitcast to vXi1 type and then back to integer. This gets the mask
15507     // register type into the IR, but might be optimized out depending on
15508     // what's around it.
15509     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15510     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
15511     return Builder.CreateBitCast(Res, Ops[0]->getType());
15512   }
15513 
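  // kunpck{bw,wd,dq}: the result is the concatenation of the low halves of the
  // two input masks.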
15514   case X86::BI__builtin_ia32_kunpckdi:
15515   case X86::BI__builtin_ia32_kunpcksi:
15516   case X86::BI__builtin_ia32_kunpckhi: {
15517     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15518     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15519     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15520     int Indices[64];
15521     for (unsigned i = 0; i != NumElts; ++i)
15522       Indices[i] = i;
15523 
15524     // First extract half of each vector. This gives better codegen than
15525     // doing it in a single shuffle.
15526     LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
15527     RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
15528     // Concat the vectors.
15529     // NOTE: Operands are swapped to match the intrinsic definition.
15530     Value *Res =
15531         Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
15532     return Builder.CreateBitCast(Res, Ops[0]->getType());
15533   }
15534 
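  // vplzcnt maps directly onto the generic ctlz intrinsic; the i1 false
  // argument means the result is defined even for a zero input.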
15535   case X86::BI__builtin_ia32_vplzcntd_128:
15536   case X86::BI__builtin_ia32_vplzcntd_256:
15537   case X86::BI__builtin_ia32_vplzcntd_512:
15538   case X86::BI__builtin_ia32_vplzcntq_128:
15539   case X86::BI__builtin_ia32_vplzcntq_256:
15540   case X86::BI__builtin_ia32_vplzcntq_512: {
15541     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
    return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
15543   }
15544   case X86::BI__builtin_ia32_sqrtss:
15545   case X86::BI__builtin_ia32_sqrtsd: {
15546     Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
15547     Function *F;
15548     if (Builder.getIsFPConstrained()) {
15549       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15550       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15551                            A->getType());
15552       A = Builder.CreateConstrainedFPCall(F, {A});
15553     } else {
15554       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
15555       A = Builder.CreateCall(F, {A});
15556     }
15557     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
15558   }
15559   case X86::BI__builtin_ia32_sqrtsh_round_mask:
15560   case X86::BI__builtin_ia32_sqrtsd_round_mask:
15561   case X86::BI__builtin_ia32_sqrtss_round_mask: {
15562     unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
    // Only lower to a generic sqrt when the rounding mode is 4
    // (AKA CUR_DIRECTION); otherwise keep the target-specific intrinsic.
15565     if (CC != 4) {
15566       Intrinsic::ID IID;
15567 
15568       switch (BuiltinID) {
15569       default:
15570         llvm_unreachable("Unsupported intrinsic!");
15571       case X86::BI__builtin_ia32_sqrtsh_round_mask:
15572         IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
15573         break;
15574       case X86::BI__builtin_ia32_sqrtsd_round_mask:
15575         IID = Intrinsic::x86_avx512_mask_sqrt_sd;
15576         break;
15577       case X86::BI__builtin_ia32_sqrtss_round_mask:
15578         IID = Intrinsic::x86_avx512_mask_sqrt_ss;
15579         break;
15580       }
15581       return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15582     }
15583     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
15584     Function *F;
15585     if (Builder.getIsFPConstrained()) {
15586       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15587       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15588                            A->getType());
15589       A = Builder.CreateConstrainedFPCall(F, A);
15590     } else {
15591       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
15592       A = Builder.CreateCall(F, A);
15593     }
15594     Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
15595     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
15596     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
15597   }
15598   case X86::BI__builtin_ia32_sqrtpd256:
15599   case X86::BI__builtin_ia32_sqrtpd:
15600   case X86::BI__builtin_ia32_sqrtps256:
15601   case X86::BI__builtin_ia32_sqrtps:
15602   case X86::BI__builtin_ia32_sqrtph256:
15603   case X86::BI__builtin_ia32_sqrtph:
15604   case X86::BI__builtin_ia32_sqrtph512:
15605   case X86::BI__builtin_ia32_sqrtps512:
15606   case X86::BI__builtin_ia32_sqrtpd512: {
15607     if (Ops.size() == 2) {
15608       unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
      // Only lower to a generic sqrt when the rounding mode is 4
      // (AKA CUR_DIRECTION); otherwise keep the target-specific intrinsic.
15611       if (CC != 4) {
15612         Intrinsic::ID IID;
15613 
15614         switch (BuiltinID) {
15615         default:
15616           llvm_unreachable("Unsupported intrinsic!");
15617         case X86::BI__builtin_ia32_sqrtph512:
15618           IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
15619           break;
15620         case X86::BI__builtin_ia32_sqrtps512:
15621           IID = Intrinsic::x86_avx512_sqrt_ps_512;
15622           break;
15623         case X86::BI__builtin_ia32_sqrtpd512:
15624           IID = Intrinsic::x86_avx512_sqrt_pd_512;
15625           break;
15626         }
15627         return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15628       }
15629     }
15630     if (Builder.getIsFPConstrained()) {
15631       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15632       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15633                                      Ops[0]->getType());
15634       return Builder.CreateConstrainedFPCall(F, Ops[0]);
15635     } else {
15636       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
15637       return Builder.CreateCall(F, Ops[0]);
15638     }
15639   }
15640 
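  // pmuludq/pmuldq: widening multiplies of the even 32-bit elements; the
  // helper emits them as plain IR multiplies on 64-bit lanes.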
15641   case X86::BI__builtin_ia32_pmuludq128:
15642   case X86::BI__builtin_ia32_pmuludq256:
15643   case X86::BI__builtin_ia32_pmuludq512:
15644     return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
15645 
15646   case X86::BI__builtin_ia32_pmuldq128:
15647   case X86::BI__builtin_ia32_pmuldq256:
15648   case X86::BI__builtin_ia32_pmuldq512:
15649     return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
15650 
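  // pternlog: emit the target intrinsic and let the helper apply merge masking
  // (_mask) or zero masking (_maskz).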
15651   case X86::BI__builtin_ia32_pternlogd512_mask:
15652   case X86::BI__builtin_ia32_pternlogq512_mask:
15653   case X86::BI__builtin_ia32_pternlogd128_mask:
15654   case X86::BI__builtin_ia32_pternlogd256_mask:
15655   case X86::BI__builtin_ia32_pternlogq128_mask:
15656   case X86::BI__builtin_ia32_pternlogq256_mask:
15657     return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
15658 
15659   case X86::BI__builtin_ia32_pternlogd512_maskz:
15660   case X86::BI__builtin_ia32_pternlogq512_maskz:
15661   case X86::BI__builtin_ia32_pternlogd128_maskz:
15662   case X86::BI__builtin_ia32_pternlogd256_maskz:
15663   case X86::BI__builtin_ia32_pternlogq128_maskz:
15664   case X86::BI__builtin_ia32_pternlogq256_maskz:
15665     return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
15666 
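  // VBMI2 concat-and-shift builtins lower to the generic funnel shift
  // intrinsics (fshl/fshr); the "right" forms swap the first two operands.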
15667   case X86::BI__builtin_ia32_vpshldd128:
15668   case X86::BI__builtin_ia32_vpshldd256:
15669   case X86::BI__builtin_ia32_vpshldd512:
15670   case X86::BI__builtin_ia32_vpshldq128:
15671   case X86::BI__builtin_ia32_vpshldq256:
15672   case X86::BI__builtin_ia32_vpshldq512:
15673   case X86::BI__builtin_ia32_vpshldw128:
15674   case X86::BI__builtin_ia32_vpshldw256:
15675   case X86::BI__builtin_ia32_vpshldw512:
15676     return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15677 
15678   case X86::BI__builtin_ia32_vpshrdd128:
15679   case X86::BI__builtin_ia32_vpshrdd256:
15680   case X86::BI__builtin_ia32_vpshrdd512:
15681   case X86::BI__builtin_ia32_vpshrdq128:
15682   case X86::BI__builtin_ia32_vpshrdq256:
15683   case X86::BI__builtin_ia32_vpshrdq512:
15684   case X86::BI__builtin_ia32_vpshrdw128:
15685   case X86::BI__builtin_ia32_vpshrdw256:
15686   case X86::BI__builtin_ia32_vpshrdw512:
15687     // Ops 0 and 1 are swapped.
15688     return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15689 
15690   case X86::BI__builtin_ia32_vpshldvd128:
15691   case X86::BI__builtin_ia32_vpshldvd256:
15692   case X86::BI__builtin_ia32_vpshldvd512:
15693   case X86::BI__builtin_ia32_vpshldvq128:
15694   case X86::BI__builtin_ia32_vpshldvq256:
15695   case X86::BI__builtin_ia32_vpshldvq512:
15696   case X86::BI__builtin_ia32_vpshldvw128:
15697   case X86::BI__builtin_ia32_vpshldvw256:
15698   case X86::BI__builtin_ia32_vpshldvw512:
15699     return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15700 
15701   case X86::BI__builtin_ia32_vpshrdvd128:
15702   case X86::BI__builtin_ia32_vpshrdvd256:
15703   case X86::BI__builtin_ia32_vpshrdvd512:
15704   case X86::BI__builtin_ia32_vpshrdvq128:
15705   case X86::BI__builtin_ia32_vpshrdvq256:
15706   case X86::BI__builtin_ia32_vpshrdvq512:
15707   case X86::BI__builtin_ia32_vpshrdvw128:
15708   case X86::BI__builtin_ia32_vpshrdvw256:
15709   case X86::BI__builtin_ia32_vpshrdvw512:
15710     // Ops 0 and 1 are swapped.
15711     return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15712 
15713   // Reductions
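  // The fadd/fmul reductions are emitted with the reassoc flag so the backend
  // is free to reassociate (e.g. tree reductions); fmax/fmin assume no NaNs.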
15714   case X86::BI__builtin_ia32_reduce_fadd_pd512:
15715   case X86::BI__builtin_ia32_reduce_fadd_ps512:
15716   case X86::BI__builtin_ia32_reduce_fadd_ph512:
15717   case X86::BI__builtin_ia32_reduce_fadd_ph256:
15718   case X86::BI__builtin_ia32_reduce_fadd_ph128: {
15719     Function *F =
15720         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
15721     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15722     Builder.getFastMathFlags().setAllowReassoc();
15723     return Builder.CreateCall(F, {Ops[0], Ops[1]});
15724   }
15725   case X86::BI__builtin_ia32_reduce_fmul_pd512:
15726   case X86::BI__builtin_ia32_reduce_fmul_ps512:
15727   case X86::BI__builtin_ia32_reduce_fmul_ph512:
15728   case X86::BI__builtin_ia32_reduce_fmul_ph256:
15729   case X86::BI__builtin_ia32_reduce_fmul_ph128: {
15730     Function *F =
15731         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
15732     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15733     Builder.getFastMathFlags().setAllowReassoc();
15734     return Builder.CreateCall(F, {Ops[0], Ops[1]});
15735   }
15736   case X86::BI__builtin_ia32_reduce_fmax_pd512:
15737   case X86::BI__builtin_ia32_reduce_fmax_ps512:
15738   case X86::BI__builtin_ia32_reduce_fmax_ph512:
15739   case X86::BI__builtin_ia32_reduce_fmax_ph256:
15740   case X86::BI__builtin_ia32_reduce_fmax_ph128: {
15741     Function *F =
15742         CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
15743     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15744     Builder.getFastMathFlags().setNoNaNs();
15745     return Builder.CreateCall(F, {Ops[0]});
15746   }
15747   case X86::BI__builtin_ia32_reduce_fmin_pd512:
15748   case X86::BI__builtin_ia32_reduce_fmin_ps512:
15749   case X86::BI__builtin_ia32_reduce_fmin_ph512:
15750   case X86::BI__builtin_ia32_reduce_fmin_ph256:
15751   case X86::BI__builtin_ia32_reduce_fmin_ph128: {
15752     Function *F =
15753         CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
15754     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15755     Builder.getFastMathFlags().setNoNaNs();
15756     return Builder.CreateCall(F, {Ops[0]});
15757   }
15758 
15759   // 3DNow!
15760   case X86::BI__builtin_ia32_pswapdsf:
15761   case X86::BI__builtin_ia32_pswapdsi: {
15762     llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
15763     Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
15764     llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
15765     return Builder.CreateCall(F, Ops, "pswapd");
15766   }
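  // rdrand/rdseed: the intrinsic returns {value, success}; store the value
  // through the pointer operand and return the success flag.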
15767   case X86::BI__builtin_ia32_rdrand16_step:
15768   case X86::BI__builtin_ia32_rdrand32_step:
15769   case X86::BI__builtin_ia32_rdrand64_step:
15770   case X86::BI__builtin_ia32_rdseed16_step:
15771   case X86::BI__builtin_ia32_rdseed32_step:
15772   case X86::BI__builtin_ia32_rdseed64_step: {
15773     Intrinsic::ID ID;
15774     switch (BuiltinID) {
15775     default: llvm_unreachable("Unsupported intrinsic!");
15776     case X86::BI__builtin_ia32_rdrand16_step:
15777       ID = Intrinsic::x86_rdrand_16;
15778       break;
15779     case X86::BI__builtin_ia32_rdrand32_step:
15780       ID = Intrinsic::x86_rdrand_32;
15781       break;
15782     case X86::BI__builtin_ia32_rdrand64_step:
15783       ID = Intrinsic::x86_rdrand_64;
15784       break;
15785     case X86::BI__builtin_ia32_rdseed16_step:
15786       ID = Intrinsic::x86_rdseed_16;
15787       break;
15788     case X86::BI__builtin_ia32_rdseed32_step:
15789       ID = Intrinsic::x86_rdseed_32;
15790       break;
15791     case X86::BI__builtin_ia32_rdseed64_step:
15792       ID = Intrinsic::x86_rdseed_64;
15793       break;
15794     }
15795 
15796     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
15797     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
15798                                       Ops[0]);
15799     return Builder.CreateExtractValue(Call, 1);
15800   }
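  // addcarry/subborrow: the intrinsic returns {carry-out, result}; store the
  // result through the last operand and return the carry-out.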
15801   case X86::BI__builtin_ia32_addcarryx_u32:
15802   case X86::BI__builtin_ia32_addcarryx_u64:
15803   case X86::BI__builtin_ia32_subborrow_u32:
15804   case X86::BI__builtin_ia32_subborrow_u64: {
15805     Intrinsic::ID IID;
15806     switch (BuiltinID) {
15807     default: llvm_unreachable("Unsupported intrinsic!");
15808     case X86::BI__builtin_ia32_addcarryx_u32:
15809       IID = Intrinsic::x86_addcarry_32;
15810       break;
15811     case X86::BI__builtin_ia32_addcarryx_u64:
15812       IID = Intrinsic::x86_addcarry_64;
15813       break;
15814     case X86::BI__builtin_ia32_subborrow_u32:
15815       IID = Intrinsic::x86_subborrow_32;
15816       break;
15817     case X86::BI__builtin_ia32_subborrow_u64:
15818       IID = Intrinsic::x86_subborrow_64;
15819       break;
15820     }
15821 
15822     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
15823                                      { Ops[0], Ops[1], Ops[2] });
15824     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
15825                                       Ops[3]);
15826     return Builder.CreateExtractValue(Call, 0);
15827   }
15828 
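  // fpclass: peel off the incoming mask, call the fpclass intrinsic, then
  // apply the mask to the vXi1 result.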
15829   case X86::BI__builtin_ia32_fpclassps128_mask:
15830   case X86::BI__builtin_ia32_fpclassps256_mask:
15831   case X86::BI__builtin_ia32_fpclassps512_mask:
15832   case X86::BI__builtin_ia32_fpclassph128_mask:
15833   case X86::BI__builtin_ia32_fpclassph256_mask:
15834   case X86::BI__builtin_ia32_fpclassph512_mask:
15835   case X86::BI__builtin_ia32_fpclasspd128_mask:
15836   case X86::BI__builtin_ia32_fpclasspd256_mask:
15837   case X86::BI__builtin_ia32_fpclasspd512_mask: {
15838     unsigned NumElts =
15839         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15840     Value *MaskIn = Ops[2];
15841     Ops.erase(&Ops[2]);
15842 
15843     Intrinsic::ID ID;
15844     switch (BuiltinID) {
15845     default: llvm_unreachable("Unsupported intrinsic!");
15846     case X86::BI__builtin_ia32_fpclassph128_mask:
15847       ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
15848       break;
15849     case X86::BI__builtin_ia32_fpclassph256_mask:
15850       ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
15851       break;
15852     case X86::BI__builtin_ia32_fpclassph512_mask:
15853       ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
15854       break;
15855     case X86::BI__builtin_ia32_fpclassps128_mask:
15856       ID = Intrinsic::x86_avx512_fpclass_ps_128;
15857       break;
15858     case X86::BI__builtin_ia32_fpclassps256_mask:
15859       ID = Intrinsic::x86_avx512_fpclass_ps_256;
15860       break;
15861     case X86::BI__builtin_ia32_fpclassps512_mask:
15862       ID = Intrinsic::x86_avx512_fpclass_ps_512;
15863       break;
15864     case X86::BI__builtin_ia32_fpclasspd128_mask:
15865       ID = Intrinsic::x86_avx512_fpclass_pd_128;
15866       break;
15867     case X86::BI__builtin_ia32_fpclasspd256_mask:
15868       ID = Intrinsic::x86_avx512_fpclass_pd_256;
15869       break;
15870     case X86::BI__builtin_ia32_fpclasspd512_mask:
15871       ID = Intrinsic::x86_avx512_fpclass_pd_512;
15872       break;
15873     }
15874 
15875     Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15876     return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
15877   }
15878 
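  // vp2intersect returns a pair of masks; widen each to the integer mask type
  // and store them through the last two pointer operands.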
15879   case X86::BI__builtin_ia32_vp2intersect_q_512:
15880   case X86::BI__builtin_ia32_vp2intersect_q_256:
15881   case X86::BI__builtin_ia32_vp2intersect_q_128:
15882   case X86::BI__builtin_ia32_vp2intersect_d_512:
15883   case X86::BI__builtin_ia32_vp2intersect_d_256:
15884   case X86::BI__builtin_ia32_vp2intersect_d_128: {
15885     unsigned NumElts =
15886         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15887     Intrinsic::ID ID;
15888 
15889     switch (BuiltinID) {
15890     default: llvm_unreachable("Unsupported intrinsic!");
15891     case X86::BI__builtin_ia32_vp2intersect_q_512:
15892       ID = Intrinsic::x86_avx512_vp2intersect_q_512;
15893       break;
15894     case X86::BI__builtin_ia32_vp2intersect_q_256:
15895       ID = Intrinsic::x86_avx512_vp2intersect_q_256;
15896       break;
15897     case X86::BI__builtin_ia32_vp2intersect_q_128:
15898       ID = Intrinsic::x86_avx512_vp2intersect_q_128;
15899       break;
15900     case X86::BI__builtin_ia32_vp2intersect_d_512:
15901       ID = Intrinsic::x86_avx512_vp2intersect_d_512;
15902       break;
15903     case X86::BI__builtin_ia32_vp2intersect_d_256:
15904       ID = Intrinsic::x86_avx512_vp2intersect_d_256;
15905       break;
15906     case X86::BI__builtin_ia32_vp2intersect_d_128:
15907       ID = Intrinsic::x86_avx512_vp2intersect_d_128;
15908       break;
15909     }
15910 
15911     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
15912     Value *Result = Builder.CreateExtractValue(Call, 0);
15913     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15914     Builder.CreateDefaultAlignedStore(Result, Ops[2]);
15915 
15916     Result = Builder.CreateExtractValue(Call, 1);
15917     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15918     return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
15919   }
15920 
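  // vpmultishiftqb has no generic IR form; emit the target intrinsic directly.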
15921   case X86::BI__builtin_ia32_vpmultishiftqb128:
15922   case X86::BI__builtin_ia32_vpmultishiftqb256:
15923   case X86::BI__builtin_ia32_vpmultishiftqb512: {
15924     Intrinsic::ID ID;
15925     switch (BuiltinID) {
15926     default: llvm_unreachable("Unsupported intrinsic!");
15927     case X86::BI__builtin_ia32_vpmultishiftqb128:
15928       ID = Intrinsic::x86_avx512_pmultishift_qb_128;
15929       break;
15930     case X86::BI__builtin_ia32_vpmultishiftqb256:
15931       ID = Intrinsic::x86_avx512_pmultishift_qb_256;
15932       break;
15933     case X86::BI__builtin_ia32_vpmultishiftqb512:
15934       ID = Intrinsic::x86_avx512_pmultishift_qb_512;
15935       break;
15936     }
15937 
15938     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15939   }
15940 
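  // vpshufbitqmb: as with fpclass above, strip the mask operand, call the
  // intrinsic, and apply the mask to the result.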
15941   case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15942   case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15943   case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
15944     unsigned NumElts =
15945         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15946     Value *MaskIn = Ops[2];
15947     Ops.erase(&Ops[2]);
15948 
15949     Intrinsic::ID ID;
15950     switch (BuiltinID) {
15951     default: llvm_unreachable("Unsupported intrinsic!");
15952     case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15953       ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
15954       break;
15955     case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15956       ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
15957       break;
15958     case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
15959       ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
15960       break;
15961     }
15962 
15963     Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15964     return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
15965   }
15966 
15967   // packed comparison intrinsics
15968   case X86::BI__builtin_ia32_cmpeqps:
15969   case X86::BI__builtin_ia32_cmpeqpd:
15970     return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
15971   case X86::BI__builtin_ia32_cmpltps:
15972   case X86::BI__builtin_ia32_cmpltpd:
15973     return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
15974   case X86::BI__builtin_ia32_cmpleps:
15975   case X86::BI__builtin_ia32_cmplepd:
15976     return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
15977   case X86::BI__builtin_ia32_cmpunordps:
15978   case X86::BI__builtin_ia32_cmpunordpd:
15979     return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
15980   case X86::BI__builtin_ia32_cmpneqps:
15981   case X86::BI__builtin_ia32_cmpneqpd:
15982     return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
15983   case X86::BI__builtin_ia32_cmpnltps:
15984   case X86::BI__builtin_ia32_cmpnltpd:
15985     return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
15986   case X86::BI__builtin_ia32_cmpnleps:
15987   case X86::BI__builtin_ia32_cmpnlepd:
15988     return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
15989   case X86::BI__builtin_ia32_cmpordps:
15990   case X86::BI__builtin_ia32_cmpordpd:
15991     return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
15992   case X86::BI__builtin_ia32_cmpph128_mask:
15993   case X86::BI__builtin_ia32_cmpph256_mask:
15994   case X86::BI__builtin_ia32_cmpph512_mask:
15995   case X86::BI__builtin_ia32_cmpps128_mask:
15996   case X86::BI__builtin_ia32_cmpps256_mask:
15997   case X86::BI__builtin_ia32_cmpps512_mask:
15998   case X86::BI__builtin_ia32_cmppd128_mask:
15999   case X86::BI__builtin_ia32_cmppd256_mask:
16000   case X86::BI__builtin_ia32_cmppd512_mask:
16001     IsMaskFCmp = true;
16002     [[fallthrough]];
16003   case X86::BI__builtin_ia32_cmpps:
16004   case X86::BI__builtin_ia32_cmpps256:
16005   case X86::BI__builtin_ia32_cmppd:
16006   case X86::BI__builtin_ia32_cmppd256: {
    // Lower these vector comparisons to fcmp instructions, ignoring the
    // requested signalling behaviour and rounding mode. This is only valid
    // when the FP model is not strict and FENV_ACCESS is off.

    // The third argument is the comparison condition, an integer in the
    // range [0, 31].
16014     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
16015 
    // Map the condition code to an fcmp predicate. The ordered-signalling and
    // ordered-quiet forms of a condition map to the same predicate, e.g. both
    // _CMP_GT_OS and _CMP_GT_OQ become FCMP_OGT; the signalling bit is tracked
    // separately in IsSignaling.
16019     FCmpInst::Predicate Pred;
16020     bool IsSignaling;
16021     // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
16022     // behavior is inverted. We'll handle that after the switch.
16023     switch (CC & 0xf) {
16024     case 0x00: Pred = FCmpInst::FCMP_OEQ;   IsSignaling = false; break;
16025     case 0x01: Pred = FCmpInst::FCMP_OLT;   IsSignaling = true;  break;
16026     case 0x02: Pred = FCmpInst::FCMP_OLE;   IsSignaling = true;  break;
16027     case 0x03: Pred = FCmpInst::FCMP_UNO;   IsSignaling = false; break;
16028     case 0x04: Pred = FCmpInst::FCMP_UNE;   IsSignaling = false; break;
16029     case 0x05: Pred = FCmpInst::FCMP_UGE;   IsSignaling = true;  break;
16030     case 0x06: Pred = FCmpInst::FCMP_UGT;   IsSignaling = true;  break;
16031     case 0x07: Pred = FCmpInst::FCMP_ORD;   IsSignaling = false; break;
16032     case 0x08: Pred = FCmpInst::FCMP_UEQ;   IsSignaling = false; break;
16033     case 0x09: Pred = FCmpInst::FCMP_ULT;   IsSignaling = true;  break;
16034     case 0x0a: Pred = FCmpInst::FCMP_ULE;   IsSignaling = true;  break;
16035     case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
16036     case 0x0c: Pred = FCmpInst::FCMP_ONE;   IsSignaling = false; break;
16037     case 0x0d: Pred = FCmpInst::FCMP_OGE;   IsSignaling = true;  break;
16038     case 0x0e: Pred = FCmpInst::FCMP_OGT;   IsSignaling = true;  break;
16039     case 0x0f: Pred = FCmpInst::FCMP_TRUE;  IsSignaling = false; break;
16040     default: llvm_unreachable("Unhandled CC");
16041     }
16042 
16043     // Invert the signalling behavior for 16-31.
16044     if (CC & 0x10)
16045       IsSignaling = !IsSignaling;
16046 
    // Under constrained intrinsics there is no generic compare we can use when
    // the predicate is TRUE or FALSE, and the mask-enabled builtins likewise
    // have no generic equivalent. In both cases fall back to the legacy
    // X86-specific intrinsic.
16052     if (Builder.getIsFPConstrained() &&
16053         (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
16054          IsMaskFCmp)) {
16055 
16056       Intrinsic::ID IID;
16057       switch (BuiltinID) {
16058       default: llvm_unreachable("Unexpected builtin");
16059       case X86::BI__builtin_ia32_cmpps:
16060         IID = Intrinsic::x86_sse_cmp_ps;
16061         break;
16062       case X86::BI__builtin_ia32_cmpps256:
16063         IID = Intrinsic::x86_avx_cmp_ps_256;
16064         break;
16065       case X86::BI__builtin_ia32_cmppd:
16066         IID = Intrinsic::x86_sse2_cmp_pd;
16067         break;
16068       case X86::BI__builtin_ia32_cmppd256:
16069         IID = Intrinsic::x86_avx_cmp_pd_256;
16070         break;
16071       case X86::BI__builtin_ia32_cmpph128_mask:
16072         IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128;
16073         break;
16074       case X86::BI__builtin_ia32_cmpph256_mask:
16075         IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256;
16076         break;
16077       case X86::BI__builtin_ia32_cmpph512_mask:
16078         IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512;
16079         break;
16080       case X86::BI__builtin_ia32_cmpps512_mask:
16081         IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
16082         break;
16083       case X86::BI__builtin_ia32_cmppd512_mask:
16084         IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
16085         break;
16086       case X86::BI__builtin_ia32_cmpps128_mask:
16087         IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
16088         break;
16089       case X86::BI__builtin_ia32_cmpps256_mask:
16090         IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
16091         break;
16092       case X86::BI__builtin_ia32_cmppd128_mask:
16093         IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
16094         break;
16095       case X86::BI__builtin_ia32_cmppd256_mask:
16096         IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
16097         break;
16098       }
16099 
16100       Function *Intr = CGM.getIntrinsic(IID);
16101       if (IsMaskFCmp) {
16102         unsigned NumElts =
16103             cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
16104         Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
16105         Value *Cmp = Builder.CreateCall(Intr, Ops);
16106         return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
16107       }
16108 
16109       return Builder.CreateCall(Intr, Ops);
16110     }
16111 
    // The _mask builtins return a compressed mask; builtins without the _mask
    // suffix return a vector of integers of the same width as the inputs.
16114     if (IsMaskFCmp) {
      // We ignore SAE if strict FP is disabled. Precise exception behavior is
      // only kept under strict FP.
      // NOTE: If strict FP ever does go through here, a CGFPOptionsRAII
      // object will be required.
16119       unsigned NumElts =
16120           cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
16121       Value *Cmp;
16122       if (IsSignaling)
16123         Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
16124       else
16125         Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
16126       return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
16127     }
16128 
16129     return getVectorFCmpIR(Pred, IsSignaling);
16130   }
16131 
16132   // SSE scalar comparison intrinsics
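  // The second argument selects the SSE immediate predicate:
  // 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD.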
16133   case X86::BI__builtin_ia32_cmpeqss:
16134     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
16135   case X86::BI__builtin_ia32_cmpltss:
16136     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
16137   case X86::BI__builtin_ia32_cmpless:
16138     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
16139   case X86::BI__builtin_ia32_cmpunordss:
16140     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
16141   case X86::BI__builtin_ia32_cmpneqss:
16142     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
16143   case X86::BI__builtin_ia32_cmpnltss:
16144     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
16145   case X86::BI__builtin_ia32_cmpnless:
16146     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
16147   case X86::BI__builtin_ia32_cmpordss:
16148     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
16149   case X86::BI__builtin_ia32_cmpeqsd:
16150     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
16151   case X86::BI__builtin_ia32_cmpltsd:
16152     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
16153   case X86::BI__builtin_ia32_cmplesd:
16154     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
16155   case X86::BI__builtin_ia32_cmpunordsd:
16156     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
16157   case X86::BI__builtin_ia32_cmpneqsd:
16158     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
16159   case X86::BI__builtin_ia32_cmpnltsd:
16160     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
16161   case X86::BI__builtin_ia32_cmpnlesd:
16162     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
16163   case X86::BI__builtin_ia32_cmpordsd:
16164     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
16165 
16166   // f16c half2float intrinsics
16167   case X86::BI__builtin_ia32_vcvtph2ps:
16168   case X86::BI__builtin_ia32_vcvtph2ps256:
16169   case X86::BI__builtin_ia32_vcvtph2ps_mask:
16170   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
16171   case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
16172     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
16173     return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
16174   }
16175 
16176   // AVX512 bf16 intrinsics
16177   case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
16178     Ops[2] = getMaskVecValue(
16179         *this, Ops[2],
16180         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
16181     Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
16182     return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16183   }
16184   case X86::BI__builtin_ia32_cvtsbf162ss_32:
16185     return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
16186 
16187   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
16188   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
16189     Intrinsic::ID IID;
16190     switch (BuiltinID) {
16191     default: llvm_unreachable("Unsupported intrinsic!");
16192     case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
16193       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
16194       break;
16195     case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
16196       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
16197       break;
16198     }
16199     Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
16200     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
16201   }
16202 
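  // __cpuid/__cpuidex: emit cpuid as inline asm and scatter EAX..EDX into the
  // four-int array pointed to by the first argument.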
16203   case X86::BI__cpuid:
16204   case X86::BI__cpuidex: {
16205     Value *FuncId = EmitScalarExpr(E->getArg(1));
16206     Value *SubFuncId = BuiltinID == X86::BI__cpuidex
16207                            ? EmitScalarExpr(E->getArg(2))
16208                            : llvm::ConstantInt::get(Int32Ty, 0);
16209 
16210     llvm::StructType *CpuidRetTy =
16211         llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty);
16212     llvm::FunctionType *FTy =
16213         llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false);
16214 
16215     StringRef Asm, Constraints;
16216     if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
16217       Asm = "cpuid";
16218       Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
16219     } else {
16220       // x86-64 uses %rbx as the base register, so preserve it.
16221       Asm = "xchgq %rbx, ${1:q}\n"
16222             "cpuid\n"
16223             "xchgq %rbx, ${1:q}";
16224       Constraints = "={ax},=r,={cx},={dx},0,2";
16225     }
16226 
16227     llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints,
16228                                                /*hasSideEffects=*/false);
16229     Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId});
16230     Value *BasePtr = EmitScalarExpr(E->getArg(0));
16231     Value *Store = nullptr;
16232     for (unsigned i = 0; i < 4; i++) {
16233       Value *Extracted = Builder.CreateExtractValue(IACall, i);
16234       Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i);
16235       Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
16236     }
16237 
    // Return the last store instruction to signal that we have emitted the
    // intrinsic.
16240     return Store;
16241   }
16242 
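  // __emul/__emulu: sign- or zero-extend both 32-bit operands to 64 bits and
  // multiply.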
16243   case X86::BI__emul:
16244   case X86::BI__emulu: {
16245     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
16246     bool isSigned = (BuiltinID == X86::BI__emul);
16247     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
16248     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
16249     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
16250   }
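  // __mulh/__umulh return the high 64 bits of the 128-bit product;
  // _mul128/_umul128 also store the high half through the third argument and
  // return the low half.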
16251   case X86::BI__mulh:
16252   case X86::BI__umulh:
16253   case X86::BI_mul128:
16254   case X86::BI_umul128: {
16255     llvm::Type *ResType = ConvertType(E->getType());
16256     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
16257 
16258     bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
16259     Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
16260     Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
16261 
16262     Value *MulResult, *HigherBits;
16263     if (IsSigned) {
16264       MulResult = Builder.CreateNSWMul(LHS, RHS);
16265       HigherBits = Builder.CreateAShr(MulResult, 64);
16266     } else {
16267       MulResult = Builder.CreateNUWMul(LHS, RHS);
16268       HigherBits = Builder.CreateLShr(MulResult, 64);
16269     }
16270     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
16271 
16272     if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
16273       return HigherBits;
16274 
16275     Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
16276     Builder.CreateStore(HigherBits, HighBitsAddress);
16277     return Builder.CreateIntCast(MulResult, ResType, IsSigned);
16278   }
16279 
16280   case X86::BI__faststorefence: {
16281     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
16282                                llvm::SyncScope::System);
16283   }
16284   case X86::BI__shiftleft128:
16285   case X86::BI__shiftright128: {
16286     llvm::Function *F = CGM.getIntrinsic(
16287         BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
16288         Int64Ty);
16289     // Flip low/high ops and zero-extend amount to matching type.
16290     // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
16291     // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
16292     std::swap(Ops[0], Ops[1]);
16293     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
16294     return Builder.CreateCall(F, Ops);
16295   }
16296   case X86::BI_ReadWriteBarrier:
16297   case X86::BI_ReadBarrier:
16298   case X86::BI_WriteBarrier: {
16299     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
16300                                llvm::SyncScope::SingleThread);
16301   }
16302 
16303   case X86::BI_AddressOfReturnAddress: {
16304     Function *F =
16305         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
16306     return Builder.CreateCall(F);
16307   }
16308   case X86::BI__stosb: {
    // We treat __stosb as a volatile memset - it may not generate a "rep stosb"
    // instruction, but it will create a memset that won't be optimized away.
16311     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
16312   }
16313   case X86::BI__ud2:
16314     // llvm.trap makes a ud2a instruction on x86.
16315     return EmitTrapCall(Intrinsic::trap);
16316   case X86::BI__int2c: {
16317     // This syscall signals a driver assertion failure in x86 NT kernels.
16318     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
16319     llvm::InlineAsm *IA =
16320         llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
16321     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
16322         getLLVMContext(), llvm::AttributeList::FunctionIndex,
16323         llvm::Attribute::NoReturn);
16324     llvm::CallInst *CI = Builder.CreateCall(IA);
16325     CI->setAttributes(NoReturnAttr);
16326     return CI;
16327   }
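  // __readfs*/__readgs*: volatile loads through pointers in address space 257
  // (FS) and 256 (GS).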
16328   case X86::BI__readfsbyte:
16329   case X86::BI__readfsword:
16330   case X86::BI__readfsdword:
16331   case X86::BI__readfsqword: {
16332     llvm::Type *IntTy = ConvertType(E->getType());
16333     Value *Ptr = Builder.CreateIntToPtr(
16334         Ops[0], llvm::PointerType::get(getLLVMContext(), 257));
16335     LoadInst *Load = Builder.CreateAlignedLoad(
16336         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
16337     Load->setVolatile(true);
16338     return Load;
16339   }
16340   case X86::BI__readgsbyte:
16341   case X86::BI__readgsword:
16342   case X86::BI__readgsdword:
16343   case X86::BI__readgsqword: {
16344     llvm::Type *IntTy = ConvertType(E->getType());
16345     Value *Ptr = Builder.CreateIntToPtr(
16346         Ops[0], llvm::PointerType::get(getLLVMContext(), 256));
16347     LoadInst *Load = Builder.CreateAlignedLoad(
16348         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
16349     Load->setVolatile(true);
16350     return Load;
16351   }
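  // KeyLocker encodekey: the remaining result values form the key handle;
  // store them 16 bytes apart through the output pointer and return element 0.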
16352   case X86::BI__builtin_ia32_encodekey128_u32: {
16353     Intrinsic::ID IID = Intrinsic::x86_encodekey128;
16354 
16355     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
16356 
16357     for (int i = 0; i < 3; ++i) {
16358       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16359       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
16360       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
16361     }
16362 
16363     return Builder.CreateExtractValue(Call, 0);
16364   }
16365   case X86::BI__builtin_ia32_encodekey256_u32: {
16366     Intrinsic::ID IID = Intrinsic::x86_encodekey256;
16367 
16368     Value *Call =
16369         Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
16370 
16371     for (int i = 0; i < 4; ++i) {
16372       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16373       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
16374       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
16375     }
16376 
16377     return Builder.CreateExtractValue(Call, 0);
16378   }
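  // KeyLocker AES: branch on the success bit of the result; store the
  // processed block on success, a zero block on failure, and return the flag.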
16379   case X86::BI__builtin_ia32_aesenc128kl_u8:
16380   case X86::BI__builtin_ia32_aesdec128kl_u8:
16381   case X86::BI__builtin_ia32_aesenc256kl_u8:
16382   case X86::BI__builtin_ia32_aesdec256kl_u8: {
16383     Intrinsic::ID IID;
16384     StringRef BlockName;
16385     switch (BuiltinID) {
16386     default:
16387       llvm_unreachable("Unexpected builtin");
16388     case X86::BI__builtin_ia32_aesenc128kl_u8:
16389       IID = Intrinsic::x86_aesenc128kl;
16390       BlockName = "aesenc128kl";
16391       break;
16392     case X86::BI__builtin_ia32_aesdec128kl_u8:
16393       IID = Intrinsic::x86_aesdec128kl;
16394       BlockName = "aesdec128kl";
16395       break;
16396     case X86::BI__builtin_ia32_aesenc256kl_u8:
16397       IID = Intrinsic::x86_aesenc256kl;
16398       BlockName = "aesenc256kl";
16399       break;
16400     case X86::BI__builtin_ia32_aesdec256kl_u8:
16401       IID = Intrinsic::x86_aesdec256kl;
16402       BlockName = "aesdec256kl";
16403       break;
16404     }
16405 
16406     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
16407 
16408     BasicBlock *NoError =
16409         createBasicBlock(BlockName + "_no_error", this->CurFn);
16410     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
16411     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
16412 
16413     Value *Ret = Builder.CreateExtractValue(Call, 0);
16414     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
16415     Value *Out = Builder.CreateExtractValue(Call, 1);
16416     Builder.CreateCondBr(Succ, NoError, Error);
16417 
16418     Builder.SetInsertPoint(NoError);
16419     Builder.CreateDefaultAlignedStore(Out, Ops[0]);
16420     Builder.CreateBr(End);
16421 
16422     Builder.SetInsertPoint(Error);
16423     Constant *Zero = llvm::Constant::getNullValue(Out->getType());
16424     Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
16425     Builder.CreateBr(End);
16426 
16427     Builder.SetInsertPoint(End);
16428     return Builder.CreateExtractValue(Call, 0);
16429   }
16430   case X86::BI__builtin_ia32_aesencwide128kl_u8:
16431   case X86::BI__builtin_ia32_aesdecwide128kl_u8:
16432   case X86::BI__builtin_ia32_aesencwide256kl_u8:
16433   case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
16434     Intrinsic::ID IID;
16435     StringRef BlockName;
16436     switch (BuiltinID) {
16437     case X86::BI__builtin_ia32_aesencwide128kl_u8:
16438       IID = Intrinsic::x86_aesencwide128kl;
16439       BlockName = "aesencwide128kl";
16440       break;
16441     case X86::BI__builtin_ia32_aesdecwide128kl_u8:
16442       IID = Intrinsic::x86_aesdecwide128kl;
16443       BlockName = "aesdecwide128kl";
16444       break;
16445     case X86::BI__builtin_ia32_aesencwide256kl_u8:
16446       IID = Intrinsic::x86_aesencwide256kl;
16447       BlockName = "aesencwide256kl";
16448       break;
16449     case X86::BI__builtin_ia32_aesdecwide256kl_u8:
16450       IID = Intrinsic::x86_aesdecwide256kl;
16451       BlockName = "aesdecwide256kl";
16452       break;
16453     }
16454 
16455     llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
16456     Value *InOps[9];
16457     InOps[0] = Ops[2];
16458     for (int i = 0; i != 8; ++i) {
16459       Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
16460       InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
16461     }
16462 
16463     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
16464 
16465     BasicBlock *NoError =
16466         createBasicBlock(BlockName + "_no_error", this->CurFn);
16467     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
16468     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
16469 
16470     Value *Ret = Builder.CreateExtractValue(Call, 0);
16471     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
16472     Builder.CreateCondBr(Succ, NoError, Error);
16473 
16474     Builder.SetInsertPoint(NoError);
16475     for (int i = 0; i != 8; ++i) {
16476       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16477       Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
16478       Builder.CreateAlignedStore(Extract, Ptr, Align(16));
16479     }
16480     Builder.CreateBr(End);
16481 
16482     Builder.SetInsertPoint(Error);
16483     for (int i = 0; i != 8; ++i) {
16484       Value *Out = Builder.CreateExtractValue(Call, i + 1);
16485       Constant *Zero = llvm::Constant::getNullValue(Out->getType());
16486       Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
16487       Builder.CreateAlignedStore(Zero, Ptr, Align(16));
16488     }
16489     Builder.CreateBr(End);
16490 
16491     Builder.SetInsertPoint(End);
16492     return Builder.CreateExtractValue(Call, 0);
16493   }
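  // FP16 complex FMA: the conjugating (vfcmadd) and plain (vfmadd) forms share
  // the same lowering and differ only in the intrinsic ID.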
16494   case X86::BI__builtin_ia32_vfcmaddcph512_mask:
16495     IsConjFMA = true;
16496     [[fallthrough]];
16497   case X86::BI__builtin_ia32_vfmaddcph512_mask: {
16498     Intrinsic::ID IID = IsConjFMA
16499                             ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
16500                             : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
16501     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16502     return EmitX86Select(*this, Ops[3], Call, Ops[0]);
16503   }
16504   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
16505     IsConjFMA = true;
16506     [[fallthrough]];
16507   case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
16508     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
16509                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
16510     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16511     Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
16512     return EmitX86Select(*this, And, Call, Ops[0]);
16513   }
16514   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
16515     IsConjFMA = true;
16516     [[fallthrough]];
16517   case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
16518     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
16519                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
16520     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16521     static constexpr int Mask[] = {0, 5, 6, 7};
16522     return Builder.CreateShuffleVector(Call, Ops[2], Mask);
16523   }
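  // prefetchi maps onto llvm.prefetch with rw=0 and cache type 0 (instruction
  // cache); Ops[1] is the locality hint.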
16524   case X86::BI__builtin_ia32_prefetchi:
16525     return Builder.CreateCall(
16526         CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
16527         {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
16528          llvm::ConstantInt::get(Int32Ty, 0)});
16529   }
16530 }
16531 
16532 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
16533                                            const CallExpr *E) {
  // Do not emit the builtin's arguments directly inside a function call's
  // argument list, because the evaluation order of function arguments is not
  // specified in C++. This is important when testing, to ensure the arguments
  // are emitted in the same order every time. E.g.:
16538   // Instead of:
16539   //   return Builder.CreateFDiv(EmitScalarExpr(E->getArg(0)),
16540   //                             EmitScalarExpr(E->getArg(1)), "swdiv");
16541   // Use:
16542   //   Value *Op0 = EmitScalarExpr(E->getArg(0));
16543   //   Value *Op1 = EmitScalarExpr(E->getArg(1));
  //   return Builder.CreateFDiv(Op0, Op1, "swdiv");
16545 
16546   Intrinsic::ID ID = Intrinsic::not_intrinsic;
16547 
16548   switch (BuiltinID) {
16549   default: return nullptr;
16550 
16551   // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
16552   // call __builtin_readcyclecounter.
16553   case PPC::BI__builtin_ppc_get_timebase:
16554     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
16555 
16556   // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
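  // Except for lxvl/lxvll, the first operand is an offset that is folded into
  // the pointer with a GEP before the intrinsic call.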
16557   case PPC::BI__builtin_altivec_lvx:
16558   case PPC::BI__builtin_altivec_lvxl:
16559   case PPC::BI__builtin_altivec_lvebx:
16560   case PPC::BI__builtin_altivec_lvehx:
16561   case PPC::BI__builtin_altivec_lvewx:
16562   case PPC::BI__builtin_altivec_lvsl:
16563   case PPC::BI__builtin_altivec_lvsr:
16564   case PPC::BI__builtin_vsx_lxvd2x:
16565   case PPC::BI__builtin_vsx_lxvw4x:
16566   case PPC::BI__builtin_vsx_lxvd2x_be:
16567   case PPC::BI__builtin_vsx_lxvw4x_be:
16568   case PPC::BI__builtin_vsx_lxvl:
16569   case PPC::BI__builtin_vsx_lxvll:
16570   {
16571     SmallVector<Value *, 2> Ops;
16572     Ops.push_back(EmitScalarExpr(E->getArg(0)));
16573     Ops.push_back(EmitScalarExpr(E->getArg(1)));
16574     if (!(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
16575           BuiltinID == PPC::BI__builtin_vsx_lxvll)) {
16576       Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
16577       Ops.pop_back();
16578     }
16579 
16580     switch (BuiltinID) {
16581     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
16582     case PPC::BI__builtin_altivec_lvx:
16583       ID = Intrinsic::ppc_altivec_lvx;
16584       break;
16585     case PPC::BI__builtin_altivec_lvxl:
16586       ID = Intrinsic::ppc_altivec_lvxl;
16587       break;
16588     case PPC::BI__builtin_altivec_lvebx:
16589       ID = Intrinsic::ppc_altivec_lvebx;
16590       break;
16591     case PPC::BI__builtin_altivec_lvehx:
16592       ID = Intrinsic::ppc_altivec_lvehx;
16593       break;
16594     case PPC::BI__builtin_altivec_lvewx:
16595       ID = Intrinsic::ppc_altivec_lvewx;
16596       break;
16597     case PPC::BI__builtin_altivec_lvsl:
16598       ID = Intrinsic::ppc_altivec_lvsl;
16599       break;
16600     case PPC::BI__builtin_altivec_lvsr:
16601       ID = Intrinsic::ppc_altivec_lvsr;
16602       break;
16603     case PPC::BI__builtin_vsx_lxvd2x:
16604       ID = Intrinsic::ppc_vsx_lxvd2x;
16605       break;
16606     case PPC::BI__builtin_vsx_lxvw4x:
16607       ID = Intrinsic::ppc_vsx_lxvw4x;
16608       break;
16609     case PPC::BI__builtin_vsx_lxvd2x_be:
16610       ID = Intrinsic::ppc_vsx_lxvd2x_be;
16611       break;
16612     case PPC::BI__builtin_vsx_lxvw4x_be:
16613       ID = Intrinsic::ppc_vsx_lxvw4x_be;
16614       break;
16615     case PPC::BI__builtin_vsx_lxvl:
16616       ID = Intrinsic::ppc_vsx_lxvl;
16617       break;
16618     case PPC::BI__builtin_vsx_lxvll:
16619       ID = Intrinsic::ppc_vsx_lxvll;
16620       break;
16621     }
16622     llvm::Function *F = CGM.getIntrinsic(ID);
16623     return Builder.CreateCall(F, Ops, "");
16624   }
16625 
16626   // vec_st, vec_xst_be
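  // As with the loads above, the offset operand is folded into the pointer
  // except for stxvl/stxvll.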
16627   case PPC::BI__builtin_altivec_stvx:
16628   case PPC::BI__builtin_altivec_stvxl:
16629   case PPC::BI__builtin_altivec_stvebx:
16630   case PPC::BI__builtin_altivec_stvehx:
16631   case PPC::BI__builtin_altivec_stvewx:
16632   case PPC::BI__builtin_vsx_stxvd2x:
16633   case PPC::BI__builtin_vsx_stxvw4x:
16634   case PPC::BI__builtin_vsx_stxvd2x_be:
16635   case PPC::BI__builtin_vsx_stxvw4x_be:
16636   case PPC::BI__builtin_vsx_stxvl:
16637   case PPC::BI__builtin_vsx_stxvll:
16638   {
16639     SmallVector<Value *, 3> Ops;
16640     Ops.push_back(EmitScalarExpr(E->getArg(0)));
16641     Ops.push_back(EmitScalarExpr(E->getArg(1)));
16642     Ops.push_back(EmitScalarExpr(E->getArg(2)));
16643     if (!(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
16644           BuiltinID == PPC::BI__builtin_vsx_stxvll)) {
16645       Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
16646       Ops.pop_back();
16647     }
16648 
16649     switch (BuiltinID) {
16650     default: llvm_unreachable("Unsupported st intrinsic!");
16651     case PPC::BI__builtin_altivec_stvx:
16652       ID = Intrinsic::ppc_altivec_stvx;
16653       break;
16654     case PPC::BI__builtin_altivec_stvxl:
16655       ID = Intrinsic::ppc_altivec_stvxl;
16656       break;
16657     case PPC::BI__builtin_altivec_stvebx:
16658       ID = Intrinsic::ppc_altivec_stvebx;
16659       break;
16660     case PPC::BI__builtin_altivec_stvehx:
16661       ID = Intrinsic::ppc_altivec_stvehx;
16662       break;
16663     case PPC::BI__builtin_altivec_stvewx:
16664       ID = Intrinsic::ppc_altivec_stvewx;
16665       break;
16666     case PPC::BI__builtin_vsx_stxvd2x:
16667       ID = Intrinsic::ppc_vsx_stxvd2x;
16668       break;
16669     case PPC::BI__builtin_vsx_stxvw4x:
16670       ID = Intrinsic::ppc_vsx_stxvw4x;
16671       break;
16672     case PPC::BI__builtin_vsx_stxvd2x_be:
16673       ID = Intrinsic::ppc_vsx_stxvd2x_be;
16674       break;
16675     case PPC::BI__builtin_vsx_stxvw4x_be:
16676       ID = Intrinsic::ppc_vsx_stxvw4x_be;
16677       break;
16678     case PPC::BI__builtin_vsx_stxvl:
16679       ID = Intrinsic::ppc_vsx_stxvl;
16680       break;
16681     case PPC::BI__builtin_vsx_stxvll:
16682       ID = Intrinsic::ppc_vsx_stxvll;
16683       break;
16684     }
16685     llvm::Function *F = CGM.getIntrinsic(ID);
16686     return Builder.CreateCall(F, Ops, "");
16687   }
16688   case PPC::BI__builtin_vsx_ldrmb: {
    // This boils down to an unaligned VMX load sequence that avoids crossing a
    // page boundary, followed by shuffling the loaded elements into the right
    // side of the vector register.
16692     Value *Op0 = EmitScalarExpr(E->getArg(0));
16693     Value *Op1 = EmitScalarExpr(E->getArg(1));
16694     int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16695     llvm::Type *ResTy = ConvertType(E->getType());
16696     bool IsLE = getTarget().isLittleEndian();
16697 
16698     // If the user wants the entire vector, just load the entire vector.
16699     if (NumBytes == 16) {
16700       Value *LD =
16701           Builder.CreateLoad(Address(Op0, ResTy, CharUnits::fromQuantity(1)));
16702       if (!IsLE)
16703         return LD;
16704 
16705       // Reverse the bytes on LE.
16706       SmallVector<int, 16> RevMask;
16707       for (int Idx = 0; Idx < 16; Idx++)
16708         RevMask.push_back(15 - Idx);
16709       return Builder.CreateShuffleVector(LD, LD, RevMask);
16710     }
16711 
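    // For partial loads, use lvx to load the aligned quadwords containing the
    // first and last requested bytes (lvx ignores the low-order address bits,
    // so neither access crosses into the next aligned quadword), then build a
    // permute control vector with lvsl/lvsr and use vperm to gather the
    // requested bytes.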
16712     llvm::Function *Lvx = CGM.getIntrinsic(Intrinsic::ppc_altivec_lvx);
16713     llvm::Function *Lvs = CGM.getIntrinsic(IsLE ? Intrinsic::ppc_altivec_lvsr
16714                                                 : Intrinsic::ppc_altivec_lvsl);
16715     llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm);
16716     Value *HiMem = Builder.CreateGEP(
16717         Int8Ty, Op0, ConstantInt::get(Op1->getType(), NumBytes - 1));
16718     Value *LoLd = Builder.CreateCall(Lvx, Op0, "ld.lo");
16719     Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi");
16720     Value *Mask1 = Builder.CreateCall(Lvs, Op0, "mask1");
16721 
16722     Op0 = IsLE ? HiLd : LoLd;
16723     Op1 = IsLE ? LoLd : HiLd;
16724     Value *AllElts = Builder.CreateCall(Vperm, {Op0, Op1, Mask1}, "shuffle1");
    Constant *Zero =
        llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType());
16726 
16727     if (IsLE) {
16728       SmallVector<int, 16> Consts;
16729       for (int Idx = 0; Idx < 16; Idx++) {
16730         int Val = (NumBytes - Idx - 1 >= 0) ? (NumBytes - Idx - 1)
16731                                             : 16 - (NumBytes - Idx);
16732         Consts.push_back(Val);
16733       }
16734       return Builder.CreateShuffleVector(Builder.CreateBitCast(AllElts, ResTy),
16735                                          Zero, Consts);
16736     }
16737     SmallVector<Constant *, 16> Consts;
16738     for (int Idx = 0; Idx < 16; Idx++)
16739       Consts.push_back(Builder.getInt8(NumBytes + Idx));
16740     Value *Mask2 = ConstantVector::get(Consts);
16741     return Builder.CreateBitCast(
16742         Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy);
16743   }
16744   case PPC::BI__builtin_vsx_strmb: {
16745     Value *Op0 = EmitScalarExpr(E->getArg(0));
16746     Value *Op1 = EmitScalarExpr(E->getArg(1));
16747     Value *Op2 = EmitScalarExpr(E->getArg(2));
16748     int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16749     bool IsLE = getTarget().isLittleEndian();
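    // Store a Width-byte chunk at Op0 + Offset, taken from element EltNo of
    // Op2 reinterpreted as a vector of Width-byte elements. On LE the element
    // is byte-swapped before the store so the in-memory byte order matches BE.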
16750     auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) {
      // If storing the whole vector, simply store it on BE; on LE, reverse
      // the bytes and then store.
16753       if (Width == 16) {
16754         Value *StVec = Op2;
16755         if (IsLE) {
16756           SmallVector<int, 16> RevMask;
16757           for (int Idx = 0; Idx < 16; Idx++)
16758             RevMask.push_back(15 - Idx);
16759           StVec = Builder.CreateShuffleVector(Op2, Op2, RevMask);
16760         }
16761         return Builder.CreateStore(
16762             StVec, Address(Op0, Op2->getType(), CharUnits::fromQuantity(1)));
16763       }
16764       auto *ConvTy = Int64Ty;
16765       unsigned NumElts = 0;
16766       switch (Width) {
16767       default:
16768         llvm_unreachable("width for stores must be a power of 2");
16769       case 8:
16770         ConvTy = Int64Ty;
16771         NumElts = 2;
16772         break;
16773       case 4:
16774         ConvTy = Int32Ty;
16775         NumElts = 4;
16776         break;
16777       case 2:
16778         ConvTy = Int16Ty;
16779         NumElts = 8;
16780         break;
16781       case 1:
16782         ConvTy = Int8Ty;
16783         NumElts = 16;
16784         break;
16785       }
16786       Value *Vec = Builder.CreateBitCast(
16787           Op2, llvm::FixedVectorType::get(ConvTy, NumElts));
16788       Value *Ptr =
16789           Builder.CreateGEP(Int8Ty, Op0, ConstantInt::get(Int64Ty, Offset));
16790       Value *Elt = Builder.CreateExtractElement(Vec, EltNo);
16791       if (IsLE && Width > 1) {
16792         Function *F = CGM.getIntrinsic(Intrinsic::bswap, ConvTy);
16793         Elt = Builder.CreateCall(F, Elt);
16794       }
16795       return Builder.CreateStore(
16796           Elt, Address(Ptr, ConvTy, CharUnits::fromQuantity(1)));
16797     };
16798     unsigned Stored = 0;
16799     unsigned RemainingBytes = NumBytes;
16800     Value *Result;
16801     if (NumBytes == 16)
16802       return StoreSubVec(16, 0, 0);
16803     if (NumBytes >= 8) {
16804       Result = StoreSubVec(8, NumBytes - 8, IsLE ? 0 : 1);
16805       RemainingBytes -= 8;
16806       Stored += 8;
16807     }
16808     if (RemainingBytes >= 4) {
16809       Result = StoreSubVec(4, NumBytes - Stored - 4,
16810                            IsLE ? (Stored >> 2) : 3 - (Stored >> 2));
16811       RemainingBytes -= 4;
16812       Stored += 4;
16813     }
16814     if (RemainingBytes >= 2) {
16815       Result = StoreSubVec(2, NumBytes - Stored - 2,
16816                            IsLE ? (Stored >> 1) : 7 - (Stored >> 1));
16817       RemainingBytes -= 2;
16818       Stored += 2;
16819     }
16820     if (RemainingBytes)
16821       Result =
16822           StoreSubVec(1, NumBytes - Stored - 1, IsLE ? Stored : 15 - Stored);
16823     return Result;
16824   }
16825   // Square root
16826   case PPC::BI__builtin_vsx_xvsqrtsp:
16827   case PPC::BI__builtin_vsx_xvsqrtdp: {
16828     llvm::Type *ResultType = ConvertType(E->getType());
16829     Value *X = EmitScalarExpr(E->getArg(0));
16830     if (Builder.getIsFPConstrained()) {
16831       llvm::Function *F = CGM.getIntrinsic(
16832           Intrinsic::experimental_constrained_sqrt, ResultType);
16833       return Builder.CreateConstrainedFPCall(F, X);
16834     } else {
16835       llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
16836       return Builder.CreateCall(F, X);
16837     }
16838   }
16839   // Count leading zeros
16840   case PPC::BI__builtin_altivec_vclzb:
16841   case PPC::BI__builtin_altivec_vclzh:
16842   case PPC::BI__builtin_altivec_vclzw:
16843   case PPC::BI__builtin_altivec_vclzd: {
16844     llvm::Type *ResultType = ConvertType(E->getType());
16845     Value *X = EmitScalarExpr(E->getArg(0));
16846     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16847     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
16848     return Builder.CreateCall(F, {X, Undef});
16849   }
16850   case PPC::BI__builtin_altivec_vctzb:
16851   case PPC::BI__builtin_altivec_vctzh:
16852   case PPC::BI__builtin_altivec_vctzw:
16853   case PPC::BI__builtin_altivec_vctzd: {
16854     llvm::Type *ResultType = ConvertType(E->getType());
16855     Value *X = EmitScalarExpr(E->getArg(0));
16856     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16857     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
16858     return Builder.CreateCall(F, {X, Undef});
16859   }
16860   case PPC::BI__builtin_altivec_vinsd:
16861   case PPC::BI__builtin_altivec_vinsw:
16862   case PPC::BI__builtin_altivec_vinsd_elt:
16863   case PPC::BI__builtin_altivec_vinsw_elt: {
16864     llvm::Type *ResultType = ConvertType(E->getType());
16865     Value *Op0 = EmitScalarExpr(E->getArg(0));
16866     Value *Op1 = EmitScalarExpr(E->getArg(1));
16867     Value *Op2 = EmitScalarExpr(E->getArg(2));
16868 
16869     bool IsUnaligned = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16870                         BuiltinID == PPC::BI__builtin_altivec_vinsd);
16871 
16872     bool Is32bit = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16873                     BuiltinID == PPC::BI__builtin_altivec_vinsw_elt);
16874 
16875     // The third argument must be a compile time constant.
16876     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16877     assert(ArgCI &&
16878            "Third Arg to vinsw/vinsd intrinsic must be a constant integer!");
16879 
    // The valid range of the third argument depends on the input type and
    // the builtin being called.
16882     int ValidMaxValue = 0;
16883     if (IsUnaligned)
16884       ValidMaxValue = (Is32bit) ? 12 : 8;
16885     else
16886       ValidMaxValue = (Is32bit) ? 3 : 1;
16887 
16888     // Get value of third argument.
16889     int64_t ConstArg = ArgCI->getSExtValue();
16890 
16891     // Compose range checking error message.
16892     std::string RangeErrMsg = IsUnaligned ? "byte" : "element";
16893     RangeErrMsg += " number " + llvm::to_string(ConstArg);
16894     RangeErrMsg += " is outside of the valid range [0, ";
16895     RangeErrMsg += llvm::to_string(ValidMaxValue) + "]";
16896 
16897     // Issue error if third argument is not within the valid range.
16898     if (ConstArg < 0 || ConstArg > ValidMaxValue)
16899       CGM.Error(E->getExprLoc(), RangeErrMsg);
16900 
16901     // Input to vec_replace_elt is an element index, convert to byte index.
16902     if (!IsUnaligned) {
16903       ConstArg *= Is32bit ? 4 : 8;
      // Fix the constant according to endianness.
16905       if (getTarget().isLittleEndian())
16906         ConstArg = (Is32bit ? 12 : 8) - ConstArg;
16907     }
16908 
16909     ID = Is32bit ? Intrinsic::ppc_altivec_vinsw : Intrinsic::ppc_altivec_vinsd;
16910     Op2 = ConstantInt::getSigned(Int32Ty, ConstArg);
16911     // Casting input to vector int as per intrinsic definition.
16912     Op0 =
16913         Is32bit
16914             ? Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4))
16915             : Builder.CreateBitCast(Op0,
16916                                     llvm::FixedVectorType::get(Int64Ty, 2));
16917     return Builder.CreateBitCast(
16918         Builder.CreateCall(CGM.getIntrinsic(ID), {Op0, Op1, Op2}), ResultType);
16919   }
16920   case PPC::BI__builtin_altivec_vpopcntb:
16921   case PPC::BI__builtin_altivec_vpopcnth:
16922   case PPC::BI__builtin_altivec_vpopcntw:
16923   case PPC::BI__builtin_altivec_vpopcntd: {
16924     llvm::Type *ResultType = ConvertType(E->getType());
16925     Value *X = EmitScalarExpr(E->getArg(0));
16926     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
16927     return Builder.CreateCall(F, X);
16928   }
16929   case PPC::BI__builtin_altivec_vadduqm:
16930   case PPC::BI__builtin_altivec_vsubuqm: {
16931     Value *Op0 = EmitScalarExpr(E->getArg(0));
16932     Value *Op1 = EmitScalarExpr(E->getArg(1));
16933     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
16934     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int128Ty, 1));
16935     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int128Ty, 1));
16936     if (BuiltinID == PPC::BI__builtin_altivec_vadduqm)
16937       return Builder.CreateAdd(Op0, Op1, "vadduqm");
16938     else
16939       return Builder.CreateSub(Op0, Op1, "vsubuqm");
16940   }
16941   case PPC::BI__builtin_altivec_vaddcuq_c:
16942   case PPC::BI__builtin_altivec_vsubcuq_c: {
16943     SmallVector<Value *, 2> Ops;
16944     Value *Op0 = EmitScalarExpr(E->getArg(0));
16945     Value *Op1 = EmitScalarExpr(E->getArg(1));
16946     llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16947         llvm::IntegerType::get(getLLVMContext(), 128), 1);
16948     Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16949     Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16950     ID = (BuiltinID == PPC::BI__builtin_altivec_vaddcuq_c)
16951              ? Intrinsic::ppc_altivec_vaddcuq
16952              : Intrinsic::ppc_altivec_vsubcuq;
16953     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16954   }
16955   case PPC::BI__builtin_altivec_vaddeuqm_c:
16956   case PPC::BI__builtin_altivec_vaddecuq_c:
16957   case PPC::BI__builtin_altivec_vsubeuqm_c:
16958   case PPC::BI__builtin_altivec_vsubecuq_c: {
16959     SmallVector<Value *, 3> Ops;
16960     Value *Op0 = EmitScalarExpr(E->getArg(0));
16961     Value *Op1 = EmitScalarExpr(E->getArg(1));
16962     Value *Op2 = EmitScalarExpr(E->getArg(2));
16963     llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16964         llvm::IntegerType::get(getLLVMContext(), 128), 1);
16965     Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16966     Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16967     Ops.push_back(Builder.CreateBitCast(Op2, V1I128Ty));
16968     switch (BuiltinID) {
16969     default:
16970       llvm_unreachable("Unsupported intrinsic!");
16971     case PPC::BI__builtin_altivec_vaddeuqm_c:
16972       ID = Intrinsic::ppc_altivec_vaddeuqm;
16973       break;
16974     case PPC::BI__builtin_altivec_vaddecuq_c:
16975       ID = Intrinsic::ppc_altivec_vaddecuq;
16976       break;
16977     case PPC::BI__builtin_altivec_vsubeuqm_c:
16978       ID = Intrinsic::ppc_altivec_vsubeuqm;
16979       break;
16980     case PPC::BI__builtin_altivec_vsubecuq_c:
16981       ID = Intrinsic::ppc_altivec_vsubecuq;
16982       break;
16983     }
16984     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16985   }
16986   // Rotate and insert under mask operation.
16987   // __rldimi(rs, is, shift, mask)
16988   // (rotl64(rs, shift) & mask) | (is & ~mask)
16989   // __rlwimi(rs, is, shift, mask)
16990   // (rotl(rs, shift) & mask) | (is & ~mask)
16991   case PPC::BI__builtin_ppc_rldimi:
16992   case PPC::BI__builtin_ppc_rlwimi: {
16993     Value *Op0 = EmitScalarExpr(E->getArg(0));
16994     Value *Op1 = EmitScalarExpr(E->getArg(1));
16995     Value *Op2 = EmitScalarExpr(E->getArg(2));
16996     Value *Op3 = EmitScalarExpr(E->getArg(3));
16997     llvm::Type *Ty = Op0->getType();
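    // A rotate left is emitted as a funnel shift of the value with itself:
    // fshl(rs, rs, shift) == rotl(rs, shift).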
16998     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
16999     if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
17000       Op2 = Builder.CreateZExt(Op2, Int64Ty);
17001     Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
17002     Value *X = Builder.CreateAnd(Shift, Op3);
17003     Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
17004     return Builder.CreateOr(X, Y);
17005   }
17006   // Rotate and insert under mask operation.
17007   // __rlwnm(rs, shift, mask)
17008   // rotl(rs, shift) & mask
17009   case PPC::BI__builtin_ppc_rlwnm: {
17010     Value *Op0 = EmitScalarExpr(E->getArg(0));
17011     Value *Op1 = EmitScalarExpr(E->getArg(1));
17012     Value *Op2 = EmitScalarExpr(E->getArg(2));
17013     llvm::Type *Ty = Op0->getType();
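    // As above, emit the rotate as fshl(rs, rs, shift), then apply the mask.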
17014     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17015     Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
17016     return Builder.CreateAnd(Shift, Op2);
17017   }
17018   case PPC::BI__builtin_ppc_poppar4:
17019   case PPC::BI__builtin_ppc_poppar8: {
17020     Value *Op0 = EmitScalarExpr(E->getArg(0));
17021     llvm::Type *ArgType = Op0->getType();
17022     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
17023     Value *Tmp = Builder.CreateCall(F, Op0);
17024 
17025     llvm::Type *ResultType = ConvertType(E->getType());
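    // The parity is the low bit of the population count.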
17026     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
17027     if (Result->getType() != ResultType)
17028       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
17029                                      "cast");
17030     return Result;
17031   }
17032   case PPC::BI__builtin_ppc_cmpb: {
17033     Value *Op0 = EmitScalarExpr(E->getArg(0));
17034     Value *Op1 = EmitScalarExpr(E->getArg(1));
17035     if (getTarget().getTriple().isPPC64()) {
17036       Function *F =
17037           CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int64Ty, Int64Ty, Int64Ty});
17038       return Builder.CreateCall(F, {Op0, Op1}, "cmpb");
17039     }
    // For 32-bit targets, emit the code as below:
17041     // %conv = trunc i64 %a to i32
17042     // %conv1 = trunc i64 %b to i32
17043     // %shr = lshr i64 %a, 32
17044     // %conv2 = trunc i64 %shr to i32
17045     // %shr3 = lshr i64 %b, 32
17046     // %conv4 = trunc i64 %shr3 to i32
17047     // %0 = tail call i32 @llvm.ppc.cmpb32(i32 %conv, i32 %conv1)
17048     // %conv5 = zext i32 %0 to i64
17049     // %1 = tail call i32 @llvm.ppc.cmpb32(i32 %conv2, i32 %conv4)
17050     // %conv614 = zext i32 %1 to i64
17051     // %shl = shl nuw i64 %conv614, 32
17052     // %or = or i64 %shl, %conv5
17053     // ret i64 %or
17054     Function *F =
17055         CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int32Ty, Int32Ty, Int32Ty});
17056     Value *ArgOneLo = Builder.CreateTrunc(Op0, Int32Ty);
17057     Value *ArgTwoLo = Builder.CreateTrunc(Op1, Int32Ty);
17058     Constant *ShiftAmt = ConstantInt::get(Int64Ty, 32);
17059     Value *ArgOneHi =
17060         Builder.CreateTrunc(Builder.CreateLShr(Op0, ShiftAmt), Int32Ty);
17061     Value *ArgTwoHi =
17062         Builder.CreateTrunc(Builder.CreateLShr(Op1, ShiftAmt), Int32Ty);
17063     Value *ResLo = Builder.CreateZExt(
17064         Builder.CreateCall(F, {ArgOneLo, ArgTwoLo}, "cmpb"), Int64Ty);
17065     Value *ResHiShift = Builder.CreateZExt(
17066         Builder.CreateCall(F, {ArgOneHi, ArgTwoHi}, "cmpb"), Int64Ty);
17067     Value *ResHi = Builder.CreateShl(ResHiShift, ShiftAmt);
17068     return Builder.CreateOr(ResLo, ResHi);
17069   }
17070   // Copy sign
17071   case PPC::BI__builtin_vsx_xvcpsgnsp:
17072   case PPC::BI__builtin_vsx_xvcpsgndp: {
17073     llvm::Type *ResultType = ConvertType(E->getType());
17074     Value *X = EmitScalarExpr(E->getArg(0));
17075     Value *Y = EmitScalarExpr(E->getArg(1));
17076     ID = Intrinsic::copysign;
17077     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
17078     return Builder.CreateCall(F, {X, Y});
17079   }
17080   // Rounding/truncation
17081   case PPC::BI__builtin_vsx_xvrspip:
17082   case PPC::BI__builtin_vsx_xvrdpip:
17083   case PPC::BI__builtin_vsx_xvrdpim:
17084   case PPC::BI__builtin_vsx_xvrspim:
17085   case PPC::BI__builtin_vsx_xvrdpi:
17086   case PPC::BI__builtin_vsx_xvrspi:
17087   case PPC::BI__builtin_vsx_xvrdpic:
17088   case PPC::BI__builtin_vsx_xvrspic:
17089   case PPC::BI__builtin_vsx_xvrdpiz:
17090   case PPC::BI__builtin_vsx_xvrspiz: {
17091     llvm::Type *ResultType = ConvertType(E->getType());
17092     Value *X = EmitScalarExpr(E->getArg(0));
17093     if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
17094         BuiltinID == PPC::BI__builtin_vsx_xvrspim)
17095       ID = Builder.getIsFPConstrained()
17096                ? Intrinsic::experimental_constrained_floor
17097                : Intrinsic::floor;
17098     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
17099              BuiltinID == PPC::BI__builtin_vsx_xvrspi)
17100       ID = Builder.getIsFPConstrained()
17101                ? Intrinsic::experimental_constrained_round
17102                : Intrinsic::round;
17103     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
17104              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
17105       ID = Builder.getIsFPConstrained()
17106                ? Intrinsic::experimental_constrained_rint
17107                : Intrinsic::rint;
17108     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
17109              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
17110       ID = Builder.getIsFPConstrained()
17111                ? Intrinsic::experimental_constrained_ceil
17112                : Intrinsic::ceil;
17113     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
17114              BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
17115       ID = Builder.getIsFPConstrained()
17116                ? Intrinsic::experimental_constrained_trunc
17117                : Intrinsic::trunc;
17118     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
17119     return Builder.getIsFPConstrained() ? Builder.CreateConstrainedFPCall(F, X)
17120                                         : Builder.CreateCall(F, X);
17121   }
17122 
17123   // Absolute value
17124   case PPC::BI__builtin_vsx_xvabsdp:
17125   case PPC::BI__builtin_vsx_xvabssp: {
17126     llvm::Type *ResultType = ConvertType(E->getType());
17127     Value *X = EmitScalarExpr(E->getArg(0));
17128     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
17129     return Builder.CreateCall(F, X);
17130   }
17131 
17132   // Fastmath by default
17133   case PPC::BI__builtin_ppc_recipdivf:
17134   case PPC::BI__builtin_ppc_recipdivd:
17135   case PPC::BI__builtin_ppc_rsqrtf:
17136   case PPC::BI__builtin_ppc_rsqrtd: {
17137     FastMathFlags FMF = Builder.getFastMathFlags();
17138     Builder.getFastMathFlags().setFast();
17139     llvm::Type *ResultType = ConvertType(E->getType());
17140     Value *X = EmitScalarExpr(E->getArg(0));
17141 
17142     if (BuiltinID == PPC::BI__builtin_ppc_recipdivf ||
17143         BuiltinID == PPC::BI__builtin_ppc_recipdivd) {
17144       Value *Y = EmitScalarExpr(E->getArg(1));
17145       Value *FDiv = Builder.CreateFDiv(X, Y, "recipdiv");
17146       Builder.getFastMathFlags() &= (FMF);
17147       return FDiv;
17148     }
17149     auto *One = ConstantFP::get(ResultType, 1.0);
17150     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
17151     Value *FDiv = Builder.CreateFDiv(One, Builder.CreateCall(F, X), "rsqrt");
17152     Builder.getFastMathFlags() &= (FMF);
17153     return FDiv;
17154   }
17155   case PPC::BI__builtin_ppc_alignx: {
17156     Value *Op0 = EmitScalarExpr(E->getArg(0));
17157     Value *Op1 = EmitScalarExpr(E->getArg(1));
17158     ConstantInt *AlignmentCI = cast<ConstantInt>(Op0);
17159     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
17160       AlignmentCI = ConstantInt::get(AlignmentCI->getIntegerType(),
17161                                      llvm::Value::MaximumAlignment);
17162 
17163     emitAlignmentAssumption(Op1, E->getArg(1),
17164                             /*The expr loc is sufficient.*/ SourceLocation(),
17165                             AlignmentCI, nullptr);
17166     return Op1;
17167   }
17168   case PPC::BI__builtin_ppc_rdlam: {
17169     Value *Op0 = EmitScalarExpr(E->getArg(0));
17170     Value *Op1 = EmitScalarExpr(E->getArg(1));
17171     Value *Op2 = EmitScalarExpr(E->getArg(2));
17172     llvm::Type *Ty = Op0->getType();
17173     Value *ShiftAmt = Builder.CreateIntCast(Op1, Ty, false);
17174     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17175     Value *Rotate = Builder.CreateCall(F, {Op0, Op0, ShiftAmt});
17176     return Builder.CreateAnd(Rotate, Op2);
17177   }
17178   case PPC::BI__builtin_ppc_load2r: {
17179     Function *F = CGM.getIntrinsic(Intrinsic::ppc_load2r);
17180     Value *Op0 = EmitScalarExpr(E->getArg(0));
17181     Value *LoadIntrinsic = Builder.CreateCall(F, {Op0});
17182     return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
17183   }
17184   // FMA variations
17185   case PPC::BI__builtin_ppc_fnmsub:
17186   case PPC::BI__builtin_ppc_fnmsubs:
17187   case PPC::BI__builtin_vsx_xvmaddadp:
17188   case PPC::BI__builtin_vsx_xvmaddasp:
17189   case PPC::BI__builtin_vsx_xvnmaddadp:
17190   case PPC::BI__builtin_vsx_xvnmaddasp:
17191   case PPC::BI__builtin_vsx_xvmsubadp:
17192   case PPC::BI__builtin_vsx_xvmsubasp:
17193   case PPC::BI__builtin_vsx_xvnmsubadp:
17194   case PPC::BI__builtin_vsx_xvnmsubasp: {
17195     llvm::Type *ResultType = ConvertType(E->getType());
17196     Value *X = EmitScalarExpr(E->getArg(0));
17197     Value *Y = EmitScalarExpr(E->getArg(1));
17198     Value *Z = EmitScalarExpr(E->getArg(2));
17199     llvm::Function *F;
17200     if (Builder.getIsFPConstrained())
17201       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
17202     else
17203       F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17204     switch (BuiltinID) {
17205       case PPC::BI__builtin_vsx_xvmaddadp:
17206       case PPC::BI__builtin_vsx_xvmaddasp:
17207         if (Builder.getIsFPConstrained())
17208           return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
17209         else
17210           return Builder.CreateCall(F, {X, Y, Z});
17211       case PPC::BI__builtin_vsx_xvnmaddadp:
17212       case PPC::BI__builtin_vsx_xvnmaddasp:
17213         if (Builder.getIsFPConstrained())
17214           return Builder.CreateFNeg(
17215               Builder.CreateConstrainedFPCall(F, {X, Y, Z}), "neg");
17216         else
17217           return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
17218       case PPC::BI__builtin_vsx_xvmsubadp:
17219       case PPC::BI__builtin_vsx_xvmsubasp:
17220         if (Builder.getIsFPConstrained())
17221           return Builder.CreateConstrainedFPCall(
17222               F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17223         else
17224           return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17225       case PPC::BI__builtin_ppc_fnmsub:
17226       case PPC::BI__builtin_ppc_fnmsubs:
17227       case PPC::BI__builtin_vsx_xvnmsubadp:
17228       case PPC::BI__builtin_vsx_xvnmsubasp:
17229         if (Builder.getIsFPConstrained())
17230           return Builder.CreateFNeg(
17231               Builder.CreateConstrainedFPCall(
17232                   F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
17233               "neg");
17234         else
17235           return Builder.CreateCall(
17236               CGM.getIntrinsic(Intrinsic::ppc_fnmsub, ResultType), {X, Y, Z});
17237       }
17238     llvm_unreachable("Unknown FMA operation");
17239     return nullptr; // Suppress no-return warning
17240   }
17241 
17242   case PPC::BI__builtin_vsx_insertword: {
17243     Value *Op0 = EmitScalarExpr(E->getArg(0));
17244     Value *Op1 = EmitScalarExpr(E->getArg(1));
17245     Value *Op2 = EmitScalarExpr(E->getArg(2));
17246     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
17247 
    // The third argument is a compile-time constant int. It must be clamped
    // to the range [0, 12].
17250     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17251     assert(ArgCI &&
17252            "Third arg to xxinsertw intrinsic must be constant integer");
17253     const int64_t MaxIndex = 12;
17254     int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
17255 
    // The builtin semantics don't exactly match the xxinsertw instruction's
    // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
17258     // word from the first argument, and inserts it in the second argument. The
17259     // instruction extracts the word from its second input register and inserts
17260     // it into its first input register, so swap the first and second arguments.
17261     std::swap(Op0, Op1);
17262 
17263     // Need to cast the second argument from a vector of unsigned int to a
17264     // vector of long long.
17265     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
17266 
17267     if (getTarget().isLittleEndian()) {
17268       // Reverse the double words in the vector we will extract from.
17269       Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17270       Op0 = Builder.CreateShuffleVector(Op0, Op0, ArrayRef<int>{1, 0});
17271 
17272       // Reverse the index.
17273       Index = MaxIndex - Index;
17274     }
17275 
17276     // Intrinsic expects the first arg to be a vector of int.
17277     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
17278     Op2 = ConstantInt::getSigned(Int32Ty, Index);
17279     return Builder.CreateCall(F, {Op0, Op1, Op2});
17280   }
17281 
17282   case PPC::BI__builtin_vsx_extractuword: {
17283     Value *Op0 = EmitScalarExpr(E->getArg(0));
17284     Value *Op1 = EmitScalarExpr(E->getArg(1));
17285     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
17286 
17287     // Intrinsic expects the first argument to be a vector of doublewords.
17288     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17289 
17290     // The second argument is a compile time constant int that needs to
17291     // be clamped to the range [0, 12].
17292     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op1);
17293     assert(ArgCI &&
17294            "Second Arg to xxextractuw intrinsic must be a constant integer!");
17295     const int64_t MaxIndex = 12;
17296     int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
17297 
17298     if (getTarget().isLittleEndian()) {
17299       // Reverse the index.
17300       Index = MaxIndex - Index;
17301       Op1 = ConstantInt::getSigned(Int32Ty, Index);
17302 
17303       // Emit the call, then reverse the double words of the results vector.
17304       Value *Call = Builder.CreateCall(F, {Op0, Op1});
17305 
17306       Value *ShuffleCall =
17307           Builder.CreateShuffleVector(Call, Call, ArrayRef<int>{1, 0});
17308       return ShuffleCall;
17309     } else {
17310       Op1 = ConstantInt::getSigned(Int32Ty, Index);
17311       return Builder.CreateCall(F, {Op0, Op1});
17312     }
17313   }
17314 
17315   case PPC::BI__builtin_vsx_xxpermdi: {
17316     Value *Op0 = EmitScalarExpr(E->getArg(0));
17317     Value *Op1 = EmitScalarExpr(E->getArg(1));
17318     Value *Op2 = EmitScalarExpr(E->getArg(2));
17319     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17320     assert(ArgCI && "Third arg must be constant integer!");
17321 
17322     unsigned Index = ArgCI->getZExtValue();
17323     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17324     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
17325 
17326     // Account for endianness by treating this as just a shuffle. So we use the
17327     // same indices for both LE and BE in order to produce expected results in
17328     // both cases.
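    // Bit 1 of the two-bit index selects the doubleword taken from the first
    // vector, and bit 0 selects the doubleword taken from the second vector.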
17329     int ElemIdx0 = (Index & 2) >> 1;
17330     int ElemIdx1 = 2 + (Index & 1);
17331 
17332     int ShuffleElts[2] = {ElemIdx0, ElemIdx1};
17333     Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
17334     QualType BIRetType = E->getType();
17335     auto RetTy = ConvertType(BIRetType);
17336     return Builder.CreateBitCast(ShuffleCall, RetTy);
17337   }
17338 
17339   case PPC::BI__builtin_vsx_xxsldwi: {
17340     Value *Op0 = EmitScalarExpr(E->getArg(0));
17341     Value *Op1 = EmitScalarExpr(E->getArg(1));
17342     Value *Op2 = EmitScalarExpr(E->getArg(2));
17343     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17344     assert(ArgCI && "Third argument must be a compile time constant");
17345     unsigned Index = ArgCI->getZExtValue() & 0x3;
17346     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
17347     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int32Ty, 4));
17348 
17349     // Create a shuffle mask
17350     int ElemIdx0;
17351     int ElemIdx1;
17352     int ElemIdx2;
17353     int ElemIdx3;
17354     if (getTarget().isLittleEndian()) {
17355       // Little endian element N comes from element 8+N-Index of the
17356       // concatenated wide vector (of course, using modulo arithmetic on
17357       // the total number of elements).
17358       ElemIdx0 = (8 - Index) % 8;
17359       ElemIdx1 = (9 - Index) % 8;
17360       ElemIdx2 = (10 - Index) % 8;
17361       ElemIdx3 = (11 - Index) % 8;
17362     } else {
17363       // Big endian ElemIdx<N> = Index + N
17364       ElemIdx0 = Index;
17365       ElemIdx1 = Index + 1;
17366       ElemIdx2 = Index + 2;
17367       ElemIdx3 = Index + 3;
17368     }
17369 
17370     int ShuffleElts[4] = {ElemIdx0, ElemIdx1, ElemIdx2, ElemIdx3};
17371     Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
17372     QualType BIRetType = E->getType();
17373     auto RetTy = ConvertType(BIRetType);
17374     return Builder.CreateBitCast(ShuffleCall, RetTy);
17375   }
17376 
17377   case PPC::BI__builtin_pack_vector_int128: {
17378     Value *Op0 = EmitScalarExpr(E->getArg(0));
17379     Value *Op1 = EmitScalarExpr(E->getArg(1));
17380     bool isLittleEndian = getTarget().isLittleEndian();
17381     Value *PoisonValue =
17382         llvm::PoisonValue::get(llvm::FixedVectorType::get(Op0->getType(), 2));
17383     Value *Res = Builder.CreateInsertElement(
17384         PoisonValue, Op0, (uint64_t)(isLittleEndian ? 1 : 0));
17385     Res = Builder.CreateInsertElement(Res, Op1,
17386                                       (uint64_t)(isLittleEndian ? 0 : 1));
17387     return Builder.CreateBitCast(Res, ConvertType(E->getType()));
17388   }
17389 
17390   case PPC::BI__builtin_unpack_vector_int128: {
17391     Value *Op0 = EmitScalarExpr(E->getArg(0));
17392     Value *Op1 = EmitScalarExpr(E->getArg(1));
17393     ConstantInt *Index = cast<ConstantInt>(Op1);
17394     Value *Unpacked = Builder.CreateBitCast(
17395         Op0, llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
17396 
17397     if (getTarget().isLittleEndian())
17398       Index =
17399           ConstantInt::get(Index->getIntegerType(), 1 - Index->getZExtValue());
17400 
17401     return Builder.CreateExtractElement(Unpacked, Index);
17402   }
17403 
17404   case PPC::BI__builtin_ppc_sthcx: {
17405     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_sthcx);
17406     Value *Op0 = EmitScalarExpr(E->getArg(0));
17407     Value *Op1 = Builder.CreateSExt(EmitScalarExpr(E->getArg(1)), Int32Ty);
17408     return Builder.CreateCall(F, {Op0, Op1});
17409   }
17410 
17411   // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
17412   // Some of the MMA instructions accumulate their result into an existing
  // accumulator whereas the others generate a new accumulator. So we need
  // custom code generation to expand a builtin call with a pointer into a
  // load (if the corresponding instruction accumulates its result), followed
  // by the call to the intrinsic and a store of the result.
17417 #define CUSTOM_BUILTIN(Name, Intr, Types, Accumulate, Feature) \
17418   case PPC::BI__builtin_##Name:
17419 #include "clang/Basic/BuiltinsPPC.def"
17420   {
17421     SmallVector<Value *, 4> Ops;
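    // Array arguments decay to pointers; all other arguments are emitted as
    // scalar values.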
17422     for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
17423       if (E->getArg(i)->getType()->isArrayType())
17424         Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
17425       else
17426         Ops.push_back(EmitScalarExpr(E->getArg(i)));
    // The first argument of these builtins is a pointer used to store their
17428     // result. However, the llvm intrinsics return their result in multiple
17429     // return values. So, here we emit code extracting these values from the
17430     // intrinsic results and storing them using that pointer.
17431     if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
17432         BuiltinID == PPC::BI__builtin_vsx_disassemble_pair ||
17433         BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
17434       unsigned NumVecs = 2;
17435       auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair;
17436       if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
17437         NumVecs = 4;
17438         Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
17439       }
17440       llvm::Function *F = CGM.getIntrinsic(Intrinsic);
17441       Address Addr = EmitPointerWithAlignment(E->getArg(1));
17442       Value *Vec = Builder.CreateLoad(Addr);
17443       Value *Call = Builder.CreateCall(F, {Vec});
17444       llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, 16);
17445       Value *Ptr = Ops[0];
      for (unsigned i = 0; i < NumVecs; i++) {
        Value *Vec = Builder.CreateExtractValue(Call, i);
        llvm::ConstantInt *Index = llvm::ConstantInt::get(IntTy, i);
17449         Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
17450         Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
17451       }
17452       return Call;
17453     }
17454     if (BuiltinID == PPC::BI__builtin_vsx_build_pair ||
17455         BuiltinID == PPC::BI__builtin_mma_build_acc) {
17456       // Reverse the order of the operands for LE, so the
17457       // same builtin call can be used on both LE and BE
17458       // without the need for the programmer to swap operands.
      // The operands are reversed starting from the second argument;
      // the first operand is the pointer to the pair/accumulator
      // that is being built.
17462       if (getTarget().isLittleEndian())
17463         std::reverse(Ops.begin() + 1, Ops.end());
17464     }
17465     bool Accumulate;
17466     switch (BuiltinID) {
17467   #define CUSTOM_BUILTIN(Name, Intr, Types, Acc, Feature) \
17468     case PPC::BI__builtin_##Name: \
17469       ID = Intrinsic::ppc_##Intr; \
17470       Accumulate = Acc; \
17471       break;
17472   #include "clang/Basic/BuiltinsPPC.def"
17473     }
17474     if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
17475         BuiltinID == PPC::BI__builtin_vsx_stxvp ||
17476         BuiltinID == PPC::BI__builtin_mma_lxvp ||
17477         BuiltinID == PPC::BI__builtin_mma_stxvp) {
17478       if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
17479           BuiltinID == PPC::BI__builtin_mma_lxvp) {
17480         Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
17481       } else {
17482         Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
17483       }
17484       Ops.pop_back();
17485       llvm::Function *F = CGM.getIntrinsic(ID);
17486       return Builder.CreateCall(F, Ops, "");
17487     }
17488     SmallVector<Value*, 4> CallOps;
17489     if (Accumulate) {
17490       Address Addr = EmitPointerWithAlignment(E->getArg(0));
17491       Value *Acc = Builder.CreateLoad(Addr);
17492       CallOps.push_back(Acc);
17493     }
    for (unsigned i = 1; i < Ops.size(); i++)
17495       CallOps.push_back(Ops[i]);
17496     llvm::Function *F = CGM.getIntrinsic(ID);
17497     Value *Call = Builder.CreateCall(F, CallOps);
17498     return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
17499   }
17500 
17501   case PPC::BI__builtin_ppc_compare_and_swap:
17502   case PPC::BI__builtin_ppc_compare_and_swaplp: {
17503     Address Addr = EmitPointerWithAlignment(E->getArg(0));
17504     Address OldValAddr = EmitPointerWithAlignment(E->getArg(1));
17505     Value *OldVal = Builder.CreateLoad(OldValAddr);
17506     QualType AtomicTy = E->getArg(0)->getType()->getPointeeType();
17507     LValue LV = MakeAddrLValue(Addr, AtomicTy);
17508     Value *Op2 = EmitScalarExpr(E->getArg(2));
17509     auto Pair = EmitAtomicCompareExchange(
17510         LV, RValue::get(OldVal), RValue::get(Op2), E->getExprLoc(),
17511         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Monotonic, true);
    // Unlike C11's atomic_compare_exchange, according to
17513     // https://www.ibm.com/docs/en/xl-c-and-cpp-aix/16.1?topic=functions-compare-swap-compare-swaplp
17514     // > In either case, the contents of the memory location specified by addr
17515     // > are copied into the memory location specified by old_val_addr.
    // But it does not specify whether the store to OldValAddr is atomic or
    // which ordering to use. Following XL's codegen, treat it as a normal
    // store.
17519     Value *LoadedVal = Pair.first.getScalarVal();
17520     Builder.CreateStore(LoadedVal, OldValAddr);
17521     return Builder.CreateZExt(Pair.second, Builder.getInt32Ty());
17522   }
17523   case PPC::BI__builtin_ppc_fetch_and_add:
17524   case PPC::BI__builtin_ppc_fetch_and_addlp: {
17525     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
17526                                  llvm::AtomicOrdering::Monotonic);
17527   }
17528   case PPC::BI__builtin_ppc_fetch_and_and:
17529   case PPC::BI__builtin_ppc_fetch_and_andlp: {
17530     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
17531                                  llvm::AtomicOrdering::Monotonic);
17532   }
17533 
17534   case PPC::BI__builtin_ppc_fetch_and_or:
17535   case PPC::BI__builtin_ppc_fetch_and_orlp: {
17536     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
17537                                  llvm::AtomicOrdering::Monotonic);
17538   }
17539   case PPC::BI__builtin_ppc_fetch_and_swap:
17540   case PPC::BI__builtin_ppc_fetch_and_swaplp: {
17541     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
17542                                  llvm::AtomicOrdering::Monotonic);
17543   }
17544   case PPC::BI__builtin_ppc_ldarx:
17545   case PPC::BI__builtin_ppc_lwarx:
17546   case PPC::BI__builtin_ppc_lharx:
17547   case PPC::BI__builtin_ppc_lbarx:
17548     return emitPPCLoadReserveIntrinsic(*this, BuiltinID, E);
17549   case PPC::BI__builtin_ppc_mfspr: {
17550     Value *Op0 = EmitScalarExpr(E->getArg(0));
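    // The SPR value is returned in a register-width integer: 32 bits on
    // 32-bit targets and 64 bits on 64-bit targets.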
17551     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
17552                               ? Int32Ty
17553                               : Int64Ty;
17554     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mfspr, RetType);
17555     return Builder.CreateCall(F, {Op0});
17556   }
17557   case PPC::BI__builtin_ppc_mtspr: {
17558     Value *Op0 = EmitScalarExpr(E->getArg(0));
17559     Value *Op1 = EmitScalarExpr(E->getArg(1));
17560     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
17561                               ? Int32Ty
17562                               : Int64Ty;
17563     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtspr, RetType);
17564     return Builder.CreateCall(F, {Op0, Op1});
17565   }
17566   case PPC::BI__builtin_ppc_popcntb: {
17567     Value *ArgValue = EmitScalarExpr(E->getArg(0));
17568     llvm::Type *ArgType = ArgValue->getType();
17569     Function *F = CGM.getIntrinsic(Intrinsic::ppc_popcntb, {ArgType, ArgType});
17570     return Builder.CreateCall(F, {ArgValue}, "popcntb");
17571   }
17572   case PPC::BI__builtin_ppc_mtfsf: {
17573     // The builtin takes a uint32 that needs to be cast to an
17574     // f64 to be passed to the intrinsic.
17575     Value *Op0 = EmitScalarExpr(E->getArg(0));
17576     Value *Op1 = EmitScalarExpr(E->getArg(1));
17577     Value *Cast = Builder.CreateUIToFP(Op1, DoubleTy);
17578     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtfsf);
17579     return Builder.CreateCall(F, {Op0, Cast}, "");
17580   }
17581 
17582   case PPC::BI__builtin_ppc_swdiv_nochk:
17583   case PPC::BI__builtin_ppc_swdivs_nochk: {
17584     Value *Op0 = EmitScalarExpr(E->getArg(0));
17585     Value *Op1 = EmitScalarExpr(E->getArg(1));
17586     FastMathFlags FMF = Builder.getFastMathFlags();
17587     Builder.getFastMathFlags().setFast();
17588     Value *FDiv = Builder.CreateFDiv(Op0, Op1, "swdiv_nochk");
17589     Builder.getFastMathFlags() &= (FMF);
17590     return FDiv;
17591   }
17592   case PPC::BI__builtin_ppc_fric:
17593     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17594                            *this, E, Intrinsic::rint,
17595                            Intrinsic::experimental_constrained_rint))
17596         .getScalarVal();
17597   case PPC::BI__builtin_ppc_frim:
17598   case PPC::BI__builtin_ppc_frims:
17599     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17600                            *this, E, Intrinsic::floor,
17601                            Intrinsic::experimental_constrained_floor))
17602         .getScalarVal();
17603   case PPC::BI__builtin_ppc_frin:
17604   case PPC::BI__builtin_ppc_frins:
17605     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17606                            *this, E, Intrinsic::round,
17607                            Intrinsic::experimental_constrained_round))
17608         .getScalarVal();
17609   case PPC::BI__builtin_ppc_frip:
17610   case PPC::BI__builtin_ppc_frips:
17611     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17612                            *this, E, Intrinsic::ceil,
17613                            Intrinsic::experimental_constrained_ceil))
17614         .getScalarVal();
17615   case PPC::BI__builtin_ppc_friz:
17616   case PPC::BI__builtin_ppc_frizs:
17617     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17618                            *this, E, Intrinsic::trunc,
17619                            Intrinsic::experimental_constrained_trunc))
17620         .getScalarVal();
17621   case PPC::BI__builtin_ppc_fsqrt:
17622   case PPC::BI__builtin_ppc_fsqrts:
17623     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17624                            *this, E, Intrinsic::sqrt,
17625                            Intrinsic::experimental_constrained_sqrt))
17626         .getScalarVal();
17627   case PPC::BI__builtin_ppc_test_data_class: {
17628     Value *Op0 = EmitScalarExpr(E->getArg(0));
17629     Value *Op1 = EmitScalarExpr(E->getArg(1));
17630     return Builder.CreateCall(
17631         CGM.getIntrinsic(Intrinsic::ppc_test_data_class, Op0->getType()),
17632         {Op0, Op1}, "test_data_class");
17633   }
17634   case PPC::BI__builtin_ppc_maxfe: {
17635     Value *Op0 = EmitScalarExpr(E->getArg(0));
17636     Value *Op1 = EmitScalarExpr(E->getArg(1));
17637     Value *Op2 = EmitScalarExpr(E->getArg(2));
17638     Value *Op3 = EmitScalarExpr(E->getArg(3));
17639     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfe),
17640                               {Op0, Op1, Op2, Op3});
17641   }
17642   case PPC::BI__builtin_ppc_maxfl: {
17643     Value *Op0 = EmitScalarExpr(E->getArg(0));
17644     Value *Op1 = EmitScalarExpr(E->getArg(1));
17645     Value *Op2 = EmitScalarExpr(E->getArg(2));
17646     Value *Op3 = EmitScalarExpr(E->getArg(3));
17647     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfl),
17648                               {Op0, Op1, Op2, Op3});
17649   }
17650   case PPC::BI__builtin_ppc_maxfs: {
17651     Value *Op0 = EmitScalarExpr(E->getArg(0));
17652     Value *Op1 = EmitScalarExpr(E->getArg(1));
17653     Value *Op2 = EmitScalarExpr(E->getArg(2));
17654     Value *Op3 = EmitScalarExpr(E->getArg(3));
17655     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfs),
17656                               {Op0, Op1, Op2, Op3});
17657   }
17658   case PPC::BI__builtin_ppc_minfe: {
17659     Value *Op0 = EmitScalarExpr(E->getArg(0));
17660     Value *Op1 = EmitScalarExpr(E->getArg(1));
17661     Value *Op2 = EmitScalarExpr(E->getArg(2));
17662     Value *Op3 = EmitScalarExpr(E->getArg(3));
17663     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfe),
17664                               {Op0, Op1, Op2, Op3});
17665   }
17666   case PPC::BI__builtin_ppc_minfl: {
17667     Value *Op0 = EmitScalarExpr(E->getArg(0));
17668     Value *Op1 = EmitScalarExpr(E->getArg(1));
17669     Value *Op2 = EmitScalarExpr(E->getArg(2));
17670     Value *Op3 = EmitScalarExpr(E->getArg(3));
17671     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfl),
17672                               {Op0, Op1, Op2, Op3});
17673   }
17674   case PPC::BI__builtin_ppc_minfs: {
17675     Value *Op0 = EmitScalarExpr(E->getArg(0));
17676     Value *Op1 = EmitScalarExpr(E->getArg(1));
17677     Value *Op2 = EmitScalarExpr(E->getArg(2));
17678     Value *Op3 = EmitScalarExpr(E->getArg(3));
17679     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfs),
17680                               {Op0, Op1, Op2, Op3});
17681   }
17682   case PPC::BI__builtin_ppc_swdiv:
17683   case PPC::BI__builtin_ppc_swdivs: {
17684     Value *Op0 = EmitScalarExpr(E->getArg(0));
17685     Value *Op1 = EmitScalarExpr(E->getArg(1));
17686     return Builder.CreateFDiv(Op0, Op1, "swdiv");
17687   }
17688   case PPC::BI__builtin_ppc_set_fpscr_rn:
17689     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd),
17690                               {EmitScalarExpr(E->getArg(0))});
17691   case PPC::BI__builtin_ppc_mffs:
17692     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm));
17693   }
17694 }
17695 
17696 namespace {
// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
17699 Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
17700                              const CallExpr *E = nullptr) {
17701   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
17702   auto *Call = CGF.Builder.CreateCall(F);
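  // The HSA kernel_dispatch_packet is 64 bytes and at least 4-byte aligned,
  // so annotate the returned pointer accordingly.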
17703   Call->addRetAttr(
17704       Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
17705   Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
17706   if (!E)
17707     return Call;
17708   QualType BuiltinRetType = E->getType();
17709   auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
17710   if (RetTy == Call->getType())
17711     return Call;
17712   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
17713 }
17714 
17715 Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
17716   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
17717   auto *Call = CGF.Builder.CreateCall(F);
17718   Call->addRetAttr(
17719       Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
17720   Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
17721   return Call;
17722 }
17723 
// \p Index is 0, 1, or 2 for the x, y, and z dimensions, respectively.
17725 /// Emit code based on Code Object ABI version.
17726 /// COV_4    : Emit code to use dispatch ptr
17727 /// COV_5    : Emit code to use implicitarg ptr
17728 /// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
17729 ///            and use its value for COV_4 or COV_5 approach. It is used for
17730 ///            compiling device libraries in an ABI-agnostic way.
17731 ///
/// Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
17733 ///       clang during compilation of user code.
17734 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
17735   llvm::LoadInst *LD;
17736 
17737   auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
17738 
17739   if (Cov == CodeObjectVersionKind::COV_None) {
17740     StringRef Name = "__oclc_ABI_version";
17741     auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
17742     if (!ABIVersionC)
17743       ABIVersionC = new llvm::GlobalVariable(
17744           CGF.CGM.getModule(), CGF.Int32Ty, false,
17745           llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
17746           llvm::GlobalVariable::NotThreadLocal,
17747           CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
17748 
17749     // This load will be eliminated by the IPSCCP because it is constant
17750     // weak_odr without externally_initialized. Either changing it to weak or
17751     // adding externally_initialized will keep the load.
17752     Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
17753                                                       CGF.CGM.getIntAlign());
17754 
17755     Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
17756         ABIVersion,
17757         llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
17758 
17759     // Indexing the implicit kernarg segment.
17760     Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
17761         CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17762 
17763     // Indexing the HSA kernel_dispatch_packet struct.
17764     Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
17765         CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17766 
17767     auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
17768     LD = CGF.Builder.CreateLoad(
17769         Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
17770   } else {
17771     Value *GEP = nullptr;
17772     if (Cov == CodeObjectVersionKind::COV_5) {
17773       // Indexing the implicit kernarg segment.
17774       GEP = CGF.Builder.CreateConstGEP1_32(
17775           CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17776     } else {
17777       // Indexing the HSA kernel_dispatch_packet struct.
17778       GEP = CGF.Builder.CreateConstGEP1_32(
17779           CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17780     }
17781     LD = CGF.Builder.CreateLoad(
17782         Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
17783   }
17784 
17785   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
17786   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
17787       APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
17788   LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
17789   LD->setMetadata(llvm::LLVMContext::MD_noundef,
17790                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17791   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17792                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17793   return LD;
17794 }
17795 
17796 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
17797 Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
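  // grid_size_x lives at byte offset 12 of the HSA kernel_dispatch_packet;
  // the y and z grid sizes follow as consecutive 32-bit fields.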
17798   const unsigned XOffset = 12;
17799   auto *DP = EmitAMDGPUDispatchPtr(CGF);
17800   // Indexing the HSA kernel_dispatch_packet struct.
17801   auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
17802   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
17803   auto *LD = CGF.Builder.CreateLoad(
17804       Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
17805   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17806                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17807   return LD;
17808 }
17809 } // namespace
17810 
17811 // For processing memory ordering and memory scope arguments of various
17812 // amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using atomic C ABI, and writes
// to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID and writes it to \p SSID.
17817 void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
17818                                               llvm::AtomicOrdering &AO,
17819                                               llvm::SyncScope::ID &SSID) {
17820   int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
17821 
17822   // Map C11/C++11 memory ordering to LLVM memory ordering
17823   assert(llvm::isValidAtomicOrderingCABI(ord));
17824   switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
17825   case llvm::AtomicOrderingCABI::acquire:
17826   case llvm::AtomicOrderingCABI::consume:
17827     AO = llvm::AtomicOrdering::Acquire;
17828     break;
17829   case llvm::AtomicOrderingCABI::release:
17830     AO = llvm::AtomicOrdering::Release;
17831     break;
17832   case llvm::AtomicOrderingCABI::acq_rel:
17833     AO = llvm::AtomicOrdering::AcquireRelease;
17834     break;
17835   case llvm::AtomicOrderingCABI::seq_cst:
17836     AO = llvm::AtomicOrdering::SequentiallyConsistent;
17837     break;
17838   case llvm::AtomicOrderingCABI::relaxed:
17839     AO = llvm::AtomicOrdering::Monotonic;
17840     break;
17841   }
17842 
17843   StringRef scp;
17844   llvm::getConstantStringInfo(Scope, scp);
17845   SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
17846 }
17847 
17848 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
17849                                                           unsigned Idx,
17850                                                           const CallExpr *E) {
17851   llvm::Value *Arg = nullptr;
17852   if ((ICEArguments & (1 << Idx)) == 0) {
17853     Arg = EmitScalarExpr(E->getArg(Idx));
17854   } else {
17855     // If this is required to be a constant, constant fold it so that we
17856     // know that the generated intrinsic gets a ConstantInt.
17857     std::optional<llvm::APSInt> Result =
17858         E->getArg(Idx)->getIntegerConstantExpr(getContext());
17859     assert(Result && "Expected argument to be a constant");
17860     Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
17861   }
17862   return Arg;
17863 }
17864 
17865 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
17866                                               const CallExpr *E) {
17867   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
17868   llvm::SyncScope::ID SSID;
17869   switch (BuiltinID) {
17870   case AMDGPU::BI__builtin_amdgcn_div_scale:
17871   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
    // argument.
17874 
17875     Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
17876 
17877     llvm::Value *X = EmitScalarExpr(E->getArg(0));
17878     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
17879     llvm::Value *Z = EmitScalarExpr(E->getArg(2));
17880 
    llvm::Function *Callee =
        CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, X->getType());
17883 
17884     llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
17885 
17886     llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
17887     llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
17888 
17889     llvm::Type *RealFlagType = FlagOutPtr.getElementType();
17890 
17891     llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
17892     Builder.CreateStore(FlagExt, FlagOutPtr);
17893     return Result;
17894   }
17895   case AMDGPU::BI__builtin_amdgcn_div_fmas:
17896   case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
17897     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17898     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17899     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17900     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
17901 
17902     llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
17903                                       Src0->getType());
17904     llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
17905     return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
17906   }
17907 
17908   case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
17909     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
17910   case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
17911     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_mov_dpp8);
17912   case AMDGPU::BI__builtin_amdgcn_mov_dpp:
17913   case AMDGPU::BI__builtin_amdgcn_update_dpp: {
17914     llvm::SmallVector<llvm::Value *, 6> Args;
17915     // Find out if any arguments are required to be integer constant
17916     // expressions.
17917     unsigned ICEArguments = 0;
17918     ASTContext::GetBuiltinTypeError Error;
17919     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
17920     assert(Error == ASTContext::GE_None && "Should not codegen an error");
17921     for (unsigned I = 0; I != E->getNumArgs(); ++I) {
17922       Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
17923     }
17924     assert(Args.size() == 5 || Args.size() == 6);
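    // The mov_dpp form has one fewer operand (no "old" value); prepend a
    // poison value so both builtins lower to the amdgcn_update_dpp intrinsic.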
17925     if (Args.size() == 5)
17926       Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
17927     Function *F =
17928         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
17929     return Builder.CreateCall(F, Args);
17930   }
17931   case AMDGPU::BI__builtin_amdgcn_div_fixup:
17932   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
17933   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
17934     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
17935   case AMDGPU::BI__builtin_amdgcn_trig_preop:
17936   case AMDGPU::BI__builtin_amdgcn_trig_preopf:
17937     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
17938   case AMDGPU::BI__builtin_amdgcn_rcp:
17939   case AMDGPU::BI__builtin_amdgcn_rcpf:
17940   case AMDGPU::BI__builtin_amdgcn_rcph:
17941     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
17942   case AMDGPU::BI__builtin_amdgcn_sqrt:
17943   case AMDGPU::BI__builtin_amdgcn_sqrtf:
17944   case AMDGPU::BI__builtin_amdgcn_sqrth:
17945     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sqrt);
17946   case AMDGPU::BI__builtin_amdgcn_rsq:
17947   case AMDGPU::BI__builtin_amdgcn_rsqf:
17948   case AMDGPU::BI__builtin_amdgcn_rsqh:
17949     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
17950   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
17951   case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
17952     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
17953   case AMDGPU::BI__builtin_amdgcn_sinf:
17954   case AMDGPU::BI__builtin_amdgcn_sinh:
17955     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
17956   case AMDGPU::BI__builtin_amdgcn_cosf:
17957   case AMDGPU::BI__builtin_amdgcn_cosh:
17958     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
17959   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
17960     return EmitAMDGPUDispatchPtr(*this, E);
17961   case AMDGPU::BI__builtin_amdgcn_logf:
17962     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
17963   case AMDGPU::BI__builtin_amdgcn_exp2f:
17964     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
17965   case AMDGPU::BI__builtin_amdgcn_log_clampf:
17966     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
17967   case AMDGPU::BI__builtin_amdgcn_ldexp:
17968   case AMDGPU::BI__builtin_amdgcn_ldexpf: {
17969     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17970     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17971     llvm::Function *F =
17972         CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
17973     return Builder.CreateCall(F, {Src0, Src1});
17974   }
17975   case AMDGPU::BI__builtin_amdgcn_ldexph: {
17976     // The raw instruction behaves differently for out-of-bounds exponent
17977     // values (implicit truncation instead of saturation to short_min/short_max).
17978     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17979     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17980     llvm::Function *F =
17981         CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
17982     return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
17983   }
17984   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
17985   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
17986   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
17987     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
17988   case AMDGPU::BI__builtin_amdgcn_frexp_exp:
17989   case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
17990     Value *Src0 = EmitScalarExpr(E->getArg(0));
17991     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17992                                 { Builder.getInt32Ty(), Src0->getType() });
17993     return Builder.CreateCall(F, Src0);
17994   }
17995   case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
17996     Value *Src0 = EmitScalarExpr(E->getArg(0));
17997     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17998                                 { Builder.getInt16Ty(), Src0->getType() });
17999     return Builder.CreateCall(F, Src0);
18000   }
18001   case AMDGPU::BI__builtin_amdgcn_fract:
18002   case AMDGPU::BI__builtin_amdgcn_fractf:
18003   case AMDGPU::BI__builtin_amdgcn_fracth:
18004     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
18005   case AMDGPU::BI__builtin_amdgcn_lerp:
18006     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
18007   case AMDGPU::BI__builtin_amdgcn_ubfe:
18008     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_ubfe);
18009   case AMDGPU::BI__builtin_amdgcn_sbfe:
18010     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_sbfe);
18011   case AMDGPU::BI__builtin_amdgcn_ballot_w32:
18012   case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
18013     llvm::Type *ResultType = ConvertType(E->getType());
18014     llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18015     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
18016     return Builder.CreateCall(F, { Src });
18017   }
18018   case AMDGPU::BI__builtin_amdgcn_uicmp:
18019   case AMDGPU::BI__builtin_amdgcn_uicmpl:
18020   case AMDGPU::BI__builtin_amdgcn_sicmp:
18021   case AMDGPU::BI__builtin_amdgcn_sicmpl: {
18022     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18023     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18024     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18025 
18026     // FIXME-GFX10: How should the 32-bit mask be handled?
18027     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
18028       { Builder.getInt64Ty(), Src0->getType() });
18029     return Builder.CreateCall(F, { Src0, Src1, Src2 });
18030   }
18031   case AMDGPU::BI__builtin_amdgcn_fcmp:
18032   case AMDGPU::BI__builtin_amdgcn_fcmpf: {
18033     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18034     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18035     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18036 
18037     // FIXME-GFX10: How should the 32-bit mask be handled?
18038     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
18039       { Builder.getInt64Ty(), Src0->getType() });
18040     return Builder.CreateCall(F, { Src0, Src1, Src2 });
18041   }
18042   case AMDGPU::BI__builtin_amdgcn_class:
18043   case AMDGPU::BI__builtin_amdgcn_classf:
18044   case AMDGPU::BI__builtin_amdgcn_classh:
18045     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
18046   case AMDGPU::BI__builtin_amdgcn_fmed3f:
18047   case AMDGPU::BI__builtin_amdgcn_fmed3h:
18048     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
18049   case AMDGPU::BI__builtin_amdgcn_ds_append:
18050   case AMDGPU::BI__builtin_amdgcn_ds_consume: {
18051     Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
18052       Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
18053     Value *Src0 = EmitScalarExpr(E->getArg(0));
18054     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
18055     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
18056   }
18057   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
18058   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
18059   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
18060     Intrinsic::ID Intrin;
18061     switch (BuiltinID) {
18062     case AMDGPU::BI__builtin_amdgcn_ds_faddf:
18063       Intrin = Intrinsic::amdgcn_ds_fadd;
18064       break;
18065     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
18066       Intrin = Intrinsic::amdgcn_ds_fmin;
18067       break;
18068     case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
18069       Intrin = Intrinsic::amdgcn_ds_fmax;
18070       break;
18071     }
18072     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18073     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18074     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18075     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
18076     llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
18077     llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
18078     llvm::FunctionType *FTy = F->getFunctionType();
18079     llvm::Type *PTy = FTy->getParamType(0);
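    // Cast the pointer operand to the address space expected by the intrinsic.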
18080     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
18081     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
18082   }
18083   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18084   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18085   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
18086   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
18087   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
18088   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
18089   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
18090   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
18091   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
18092   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
18093     Intrinsic::ID IID;
18094     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
18095     switch (BuiltinID) {
18096     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18097       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18098       IID = Intrinsic::amdgcn_global_atomic_fadd;
18099       break;
18100     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
18101       ArgTy = llvm::FixedVectorType::get(
18102           llvm::Type::getHalfTy(getLLVMContext()), 2);
18103       IID = Intrinsic::amdgcn_global_atomic_fadd;
18104       break;
18105     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18106       IID = Intrinsic::amdgcn_global_atomic_fadd;
18107       break;
18108     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
18109       IID = Intrinsic::amdgcn_global_atomic_fmin;
18110       break;
18111     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
18112       IID = Intrinsic::amdgcn_global_atomic_fmax;
18113       break;
18114     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
18115       IID = Intrinsic::amdgcn_flat_atomic_fadd;
18116       break;
18117     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
18118       IID = Intrinsic::amdgcn_flat_atomic_fmin;
18119       break;
18120     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
18121       IID = Intrinsic::amdgcn_flat_atomic_fmax;
18122       break;
18123     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
18124       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18125       IID = Intrinsic::amdgcn_flat_atomic_fadd;
18126       break;
18127     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
18128       ArgTy = llvm::FixedVectorType::get(
18129           llvm::Type::getHalfTy(getLLVMContext()), 2);
18130       IID = Intrinsic::amdgcn_flat_atomic_fadd;
18131       break;
18132     }
18133     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18134     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18135     llvm::Function *F =
18136         CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
18137     return Builder.CreateCall(F, {Addr, Val});
18138   }
18139   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18140   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
18141     Intrinsic::ID IID;
18142     switch (BuiltinID) {
18143     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18144       IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
18145       break;
18146     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
18147       IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
18148       break;
18149     }
18150     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18151     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18152     llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
18153     return Builder.CreateCall(F, {Addr, Val});
18154   }
18155   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18156   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18157   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
18158     Intrinsic::ID IID;
18159     llvm::Type *ArgTy;
18160     switch (BuiltinID) {
18161     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18162       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18163       IID = Intrinsic::amdgcn_ds_fadd;
18164       break;
18165     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18166       ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
18167       IID = Intrinsic::amdgcn_ds_fadd;
18168       break;
18169     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
18170       ArgTy = llvm::FixedVectorType::get(
18171           llvm::Type::getHalfTy(getLLVMContext()), 2);
18172       IID = Intrinsic::amdgcn_ds_fadd;
18173       break;
18174     }
18175     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18176     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18177     llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
18178         llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
18179     llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
18180         llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
18181     llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
18182     return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
18183   }
18184   case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18185   case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18186   case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18187   case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18188   case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18189   case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
18190 
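    // Select the result type that picks the matching overload of the
    // amdgcn_global_load_tr intrinsic.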
18191     llvm::Type *ArgTy;
18192     switch (BuiltinID) {
18193     case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18194       ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
18195       break;
18196     case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18197       ArgTy = llvm::FixedVectorType::get(
18198           llvm::Type::getInt32Ty(getLLVMContext()), 2);
18199       break;
18200     case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18201       ArgTy = llvm::FixedVectorType::get(
18202           llvm::Type::getHalfTy(getLLVMContext()), 4);
18203       break;
18204     case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18205       ArgTy = llvm::FixedVectorType::get(
18206           llvm::Type::getInt16Ty(getLLVMContext()), 4);
18207       break;
18208     case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18209       ArgTy = llvm::FixedVectorType::get(
18210           llvm::Type::getHalfTy(getLLVMContext()), 8);
18211       break;
18212     case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
18213       ArgTy = llvm::FixedVectorType::get(
18214           llvm::Type::getInt16Ty(getLLVMContext()), 8);
18215       break;
18216     }
18217 
18218     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18219     llvm::Function *F =
18220         CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
18221     return Builder.CreateCall(F, {Addr});
18222   }
18223   case AMDGPU::BI__builtin_amdgcn_read_exec:
18224     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
18225   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
18226     return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
18227   case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
18228     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
18229   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
18230   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
18231   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
18232   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
18233     llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
18234     llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
18235     llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
18236     llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
18237     llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
18238     llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
18239 
18240     // The builtins take these arguments as vec4 where the last element is
18241     // ignored. The intrinsic takes them as vec3.
18242     RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
18243                                             ArrayRef<int>{0, 1, 2});
18244     RayDir =
18245         Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
18246     RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
18247                                                 ArrayRef<int>{0, 1, 2});
18248 
18249     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
18250                                    {NodePtr->getType(), RayDir->getType()});
18251     return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
18252                                   RayInverseDir, TextureDescr});
18253   }
18254 
18255   case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
18256     SmallVector<Value *, 4> Args;
18257     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
18258       Args.push_back(EmitScalarExpr(E->getArg(i)));
18259 
18260     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
18261     Value *Call = Builder.CreateCall(F, Args);
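    // The intrinsic returns a two-element struct; repack it into the vector
    // type the builtin returns.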
18262     Value *Rtn = Builder.CreateExtractValue(Call, 0);
18263     Value *A = Builder.CreateExtractValue(Call, 1);
18264     llvm::Type *RetTy = ConvertType(E->getType());
18265     Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
18266                                             (uint64_t)0);
18267     return Builder.CreateInsertElement(I0, A, 1);
18268   }
18269 
18270   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
18271   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
18272   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18273   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18274   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
18275   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
18276   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18277   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18278   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
18279   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
18280   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
18281   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
18282   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
18283   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
18284   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
18285   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18286   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18287   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18288   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18289   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18290   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18291   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18292   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18293   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18294   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18295   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18296   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18297   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18298   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18299   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18300   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18301   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18302   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18303   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18304   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18305   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18306   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18307   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18308   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18309   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18310   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18311   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18312   case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18313   case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18314   case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18315   case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18316   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18317   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18318   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18319   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18320   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18321   case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18322   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18323   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18324   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18325   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18326   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18327   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18328   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18329   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
18330 
18331     // These operations perform a matrix multiplication and accumulation of
18332     // the form:
18333     //             D = A * B + C
18334     // We need to specify one type for matrices AB and one for matrices CD.
18335     // Sparse matrix operations can have different types for A and B as well as
18336     // an additional type for the sparsity index.
18337     // The destination type should be listed before the source operand types.
18338     SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
18339     // On GFX12, the intrinsics with a 16-bit accumulator use a packed layout.
18340     // There is no need for the variable opsel argument, so it is always set
18341     // to "false".
18342     bool AppendFalseForOpselArg = false;
18343     unsigned BuiltinWMMAOp;
18344 
18345     switch (BuiltinID) {
18346     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
18347     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
18348     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18349     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18350       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18351       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
18352       break;
18353     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
18354     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
18355     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18356     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18357       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18358       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
18359       break;
18360     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18361     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18362       AppendFalseForOpselArg = true;
18363       LLVM_FALLTHROUGH;
18364     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
18365     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18366       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18367       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
18368       break;
18369     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18370     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18371       AppendFalseForOpselArg = true;
18372       LLVM_FALLTHROUGH;
18373     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
18374     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18375       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18376       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
18377       break;
18378     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
18379     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18380       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18381       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
18382       break;
18383     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
18384     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18385       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18386       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
18387       break;
18388     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
18389     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18390     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18391     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18392       ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18393       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
18394       break;
18395     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
18396     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
18397     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18398     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18399       ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18400       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
18401       break;
18402     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18403     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18404       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18405       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
18406       break;
18407     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18408     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18409       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18410       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
18411       break;
18412     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18413     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18414       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18415       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
18416       break;
18417     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18418     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18419       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18420       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
18421       break;
18422     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18423     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18424       ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18425       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
18426       break;
18427     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18428     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18429       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18430       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
18431       break;
18432     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18433     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18434       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18435       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
18436       break;
18437     case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18438     case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18439       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18440       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
18441       break;
18442     case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18443     case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18444       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18445       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
18446       break;
18447     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18448     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18449       ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18450       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
18451       break;
18452     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18453     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18454       ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18455       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
18456       break;
18457     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18458     case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18459       ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18460       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
18461       break;
18462     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18463     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18464       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18465       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
18466       break;
18467     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18468     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18469       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18470       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
18471       break;
18472     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18473     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18474       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18475       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
18476       break;
18477     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18478     case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
18479       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18480       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
18481       break;
18482     }
18483 
18484     SmallVector<Value *, 6> Args;
18485     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
18486       Args.push_back(EmitScalarExpr(E->getArg(i)));
18487     if (AppendFalseForOpselArg)
18488       Args.push_back(Builder.getFalse());
18489 
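    // The intrinsic is overloaded on the matrix operand types recorded above.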
18490     SmallVector<llvm::Type *, 6> ArgTypes;
18491     for (auto ArgIdx : ArgsForMatchingMatrixTypes)
18492       ArgTypes.push_back(Args[ArgIdx]->getType());
18493 
18494     Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
18495     return Builder.CreateCall(F, Args);
18496   }
18497 
18498   // amdgcn workitem
18499   case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
18500     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
18501   case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
18502     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
18503   case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
18504     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
18505 
18506   // amdgcn workgroup size
18507   case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
18508     return EmitAMDGPUWorkGroupSize(*this, 0);
18509   case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
18510     return EmitAMDGPUWorkGroupSize(*this, 1);
18511   case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
18512     return EmitAMDGPUWorkGroupSize(*this, 2);
18513 
18514   // amdgcn grid size
18515   case AMDGPU::BI__builtin_amdgcn_grid_size_x:
18516     return EmitAMDGPUGridSize(*this, 0);
18517   case AMDGPU::BI__builtin_amdgcn_grid_size_y:
18518     return EmitAMDGPUGridSize(*this, 1);
18519   case AMDGPU::BI__builtin_amdgcn_grid_size_z:
18520     return EmitAMDGPUGridSize(*this, 2);
18521 
18522   // r600 intrinsics
18523   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
18524   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
18525     return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
18526   case AMDGPU::BI__builtin_r600_read_tidig_x:
18527     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
18528   case AMDGPU::BI__builtin_r600_read_tidig_y:
18529     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
18530   case AMDGPU::BI__builtin_r600_read_tidig_z:
18531     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
18532   case AMDGPU::BI__builtin_amdgcn_alignbit: {
18533     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18534     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18535     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18536     Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
18537     return Builder.CreateCall(F, { Src0, Src1, Src2 });
18538   }
18539   case AMDGPU::BI__builtin_amdgcn_fence: {
18540     ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
18541                             EmitScalarExpr(E->getArg(1)), AO, SSID);
18542     return Builder.CreateFence(AO, SSID);
18543   }
18544   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
18545   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
18546   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
18547   case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
18548     llvm::AtomicRMWInst::BinOp BinOp;
18549     switch (BuiltinID) {
18550     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
18551     case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
18552       BinOp = llvm::AtomicRMWInst::UIncWrap;
18553       break;
18554     case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
18555     case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
18556       BinOp = llvm::AtomicRMWInst::UDecWrap;
18557       break;
18558     }
18559 
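    // Operands: pointer, value, C ABI memory order, and synchronization scope
    // string.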
18560     Address Ptr = CheckAtomicAlignment(*this, E);
18561     Value *Val = EmitScalarExpr(E->getArg(1));
18562 
18563     ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
18564                             EmitScalarExpr(E->getArg(3)), AO, SSID);
18565 
18566     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
18567     bool Volatile =
18568         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
18569 
18570     llvm::AtomicRMWInst *RMW =
18571         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
18572     if (Volatile)
18573       RMW->setVolatile(true);
18574     return RMW;
18575   }
18576   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
18577   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
18578     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
18579     llvm::Type *ResultType = ConvertType(E->getType());
18580     // s_sendmsg_rtn is mangled using the return type only.
18581     Function *F =
18582         CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
18583     return Builder.CreateCall(F, {Arg});
18584   }
18585   default:
18586     return nullptr;
18587   }
18588 }
18589 
18590 /// Handle a SystemZ function in which the final argument is a pointer
18591 /// to an int that receives the post-instruction CC value.  At the LLVM level
18592 /// this is represented as a function that returns a {result, cc} pair.
18593 static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
18594                                          unsigned IntrinsicID,
18595                                          const CallExpr *E) {
18596   unsigned NumArgs = E->getNumArgs() - 1;
18597   SmallVector<Value *, 8> Args(NumArgs);
18598   for (unsigned I = 0; I < NumArgs; ++I)
18599     Args[I] = CGF.EmitScalarExpr(E->getArg(I));
18600   Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
18601   Function *F = CGF.CGM.getIntrinsic(IntrinsicID);
18602   Value *Call = CGF.Builder.CreateCall(F, Args);
18603   Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
18604   CGF.Builder.CreateStore(CC, CCPtr);
18605   return CGF.Builder.CreateExtractValue(Call, 0);
18606 }
18607 
18608 Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
18609                                                const CallExpr *E) {
18610   switch (BuiltinID) {
18611   case SystemZ::BI__builtin_tbegin: {
18612     Value *TDB = EmitScalarExpr(E->getArg(0));
18613     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
18614     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
18615     return Builder.CreateCall(F, {TDB, Control});
18616   }
18617   case SystemZ::BI__builtin_tbegin_nofloat: {
18618     Value *TDB = EmitScalarExpr(E->getArg(0));
18619     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
18620     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
18621     return Builder.CreateCall(F, {TDB, Control});
18622   }
18623   case SystemZ::BI__builtin_tbeginc: {
18624     Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
18625     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
18626     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
18627     return Builder.CreateCall(F, {TDB, Control});
18628   }
18629   case SystemZ::BI__builtin_tabort: {
18630     Value *Data = EmitScalarExpr(E->getArg(0));
18631     Function *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
18632     return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
18633   }
18634   case SystemZ::BI__builtin_non_tx_store: {
18635     Value *Address = EmitScalarExpr(E->getArg(0));
18636     Value *Data = EmitScalarExpr(E->getArg(1));
18637     Function *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
18638     return Builder.CreateCall(F, {Data, Address});
18639   }
18640 
18641   // Vector builtins.  Note that most vector builtins are mapped automatically
18642   // to target-specific LLVM intrinsics.  The ones handled specially here can
18643   // be represented via standard LLVM IR, which is preferable because it
18644   // enables common LLVM optimizations.
18645 
18646   case SystemZ::BI__builtin_s390_vpopctb:
18647   case SystemZ::BI__builtin_s390_vpopcth:
18648   case SystemZ::BI__builtin_s390_vpopctf:
18649   case SystemZ::BI__builtin_s390_vpopctg: {
18650     llvm::Type *ResultType = ConvertType(E->getType());
18651     Value *X = EmitScalarExpr(E->getArg(0));
18652     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
18653     return Builder.CreateCall(F, X);
18654   }
18655 
18656   case SystemZ::BI__builtin_s390_vclzb:
18657   case SystemZ::BI__builtin_s390_vclzh:
18658   case SystemZ::BI__builtin_s390_vclzf:
18659   case SystemZ::BI__builtin_s390_vclzg: {
18660     llvm::Type *ResultType = ConvertType(E->getType());
18661     Value *X = EmitScalarExpr(E->getArg(0));
18662     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
18663     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
18664     return Builder.CreateCall(F, {X, Undef});
18665   }
18666 
18667   case SystemZ::BI__builtin_s390_vctzb:
18668   case SystemZ::BI__builtin_s390_vctzh:
18669   case SystemZ::BI__builtin_s390_vctzf:
18670   case SystemZ::BI__builtin_s390_vctzg: {
18671     llvm::Type *ResultType = ConvertType(E->getType());
18672     Value *X = EmitScalarExpr(E->getArg(0));
18673     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
18674     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
18675     return Builder.CreateCall(F, {X, Undef});
18676   }
18677 
18678   case SystemZ::BI__builtin_s390_verllb:
18679   case SystemZ::BI__builtin_s390_verllh:
18680   case SystemZ::BI__builtin_s390_verllf:
18681   case SystemZ::BI__builtin_s390_verllg: {
18682     llvm::Type *ResultType = ConvertType(E->getType());
18683     llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18684     llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
18685     // Splat scalar rotate amount to vector type.
18686     unsigned NumElts = cast<llvm::FixedVectorType>(ResultType)->getNumElements();
18687     Amt = Builder.CreateIntCast(Amt, ResultType->getScalarType(), false);
18688     Amt = Builder.CreateVectorSplat(NumElts, Amt);
18689     Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
18690     return Builder.CreateCall(F, { Src, Src, Amt });
18691   }
18692 
18693   case SystemZ::BI__builtin_s390_verllvb:
18694   case SystemZ::BI__builtin_s390_verllvh:
18695   case SystemZ::BI__builtin_s390_verllvf:
18696   case SystemZ::BI__builtin_s390_verllvg: {
18697     llvm::Type *ResultType = ConvertType(E->getType());
18698     llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18699     llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
18700     Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
18701     return Builder.CreateCall(F, { Src, Src, Amt });
18702   }
18703 
18704   case SystemZ::BI__builtin_s390_vfsqsb:
18705   case SystemZ::BI__builtin_s390_vfsqdb: {
18706     llvm::Type *ResultType = ConvertType(E->getType());
18707     Value *X = EmitScalarExpr(E->getArg(0));
18708     if (Builder.getIsFPConstrained()) {
18709       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, ResultType);
18710       return Builder.CreateConstrainedFPCall(F, { X });
18711     } else {
18712       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
18713       return Builder.CreateCall(F, X);
18714     }
18715   }
18716   case SystemZ::BI__builtin_s390_vfmasb:
18717   case SystemZ::BI__builtin_s390_vfmadb: {
18718     llvm::Type *ResultType = ConvertType(E->getType());
18719     Value *X = EmitScalarExpr(E->getArg(0));
18720     Value *Y = EmitScalarExpr(E->getArg(1));
18721     Value *Z = EmitScalarExpr(E->getArg(2));
18722     if (Builder.getIsFPConstrained()) {
18723       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18724       return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
18725     } else {
18726       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18727       return Builder.CreateCall(F, {X, Y, Z});
18728     }
18729   }
18730   case SystemZ::BI__builtin_s390_vfmssb:
18731   case SystemZ::BI__builtin_s390_vfmsdb: {
18732     llvm::Type *ResultType = ConvertType(E->getType());
18733     Value *X = EmitScalarExpr(E->getArg(0));
18734     Value *Y = EmitScalarExpr(E->getArg(1));
18735     Value *Z = EmitScalarExpr(E->getArg(2));
18736     if (Builder.getIsFPConstrained()) {
18737       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18738       return Builder.CreateConstrainedFPCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
18739     } else {
18740       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18741       return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
18742     }
18743   }
18744   case SystemZ::BI__builtin_s390_vfnmasb:
18745   case SystemZ::BI__builtin_s390_vfnmadb: {
18746     llvm::Type *ResultType = ConvertType(E->getType());
18747     Value *X = EmitScalarExpr(E->getArg(0));
18748     Value *Y = EmitScalarExpr(E->getArg(1));
18749     Value *Z = EmitScalarExpr(E->getArg(2));
18750     if (Builder.getIsFPConstrained()) {
18751       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18752       return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y,  Z}), "neg");
18753     } else {
18754       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18755       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
18756     }
18757   }
18758   case SystemZ::BI__builtin_s390_vfnmssb:
18759   case SystemZ::BI__builtin_s390_vfnmsdb: {
18760     llvm::Type *ResultType = ConvertType(E->getType());
18761     Value *X = EmitScalarExpr(E->getArg(0));
18762     Value *Y = EmitScalarExpr(E->getArg(1));
18763     Value *Z = EmitScalarExpr(E->getArg(2));
18764     if (Builder.getIsFPConstrained()) {
18765       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18766       Value *NegZ = Builder.CreateFNeg(Z, "sub");
18767       return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y, NegZ}));
18768     } else {
18769       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18770       Value *NegZ = Builder.CreateFNeg(Z, "neg");
18771       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, NegZ}));
18772     }
18773   }
18774   case SystemZ::BI__builtin_s390_vflpsb:
18775   case SystemZ::BI__builtin_s390_vflpdb: {
18776     llvm::Type *ResultType = ConvertType(E->getType());
18777     Value *X = EmitScalarExpr(E->getArg(0));
18778     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
18779     return Builder.CreateCall(F, X);
18780   }
18781   case SystemZ::BI__builtin_s390_vflnsb:
18782   case SystemZ::BI__builtin_s390_vflndb: {
18783     llvm::Type *ResultType = ConvertType(E->getType());
18784     Value *X = EmitScalarExpr(E->getArg(0));
18785     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
18786     return Builder.CreateFNeg(Builder.CreateCall(F, X), "neg");
18787   }
18788   case SystemZ::BI__builtin_s390_vfisb:
18789   case SystemZ::BI__builtin_s390_vfidb: {
18790     llvm::Type *ResultType = ConvertType(E->getType());
18791     Value *X = EmitScalarExpr(E->getArg(0));
18792     // Constant-fold the M4 and M5 mask arguments.
18793     llvm::APSInt M4 = *E->getArg(1)->getIntegerConstantExpr(getContext());
18794     llvm::APSInt M5 = *E->getArg(2)->getIntegerConstantExpr(getContext());
18795     // Check whether this instance can be represented via an LLVM standard
18796     // intrinsic.  We only support some combinations of M4 and M5.
18797     Intrinsic::ID ID = Intrinsic::not_intrinsic;
18798     Intrinsic::ID CI;
18799     switch (M4.getZExtValue()) {
18800     default: break;
18801     case 0:  // IEEE-inexact exception allowed
18802       switch (M5.getZExtValue()) {
18803       default: break;
18804       case 0: ID = Intrinsic::rint;
18805               CI = Intrinsic::experimental_constrained_rint; break;
18806       }
18807       break;
18808     case 4:  // IEEE-inexact exception suppressed
18809       switch (M5.getZExtValue()) {
18810       default: break;
18811       case 0: ID = Intrinsic::nearbyint;
18812               CI = Intrinsic::experimental_constrained_nearbyint; break;
18813       case 1: ID = Intrinsic::round;
18814               CI = Intrinsic::experimental_constrained_round; break;
18815       case 5: ID = Intrinsic::trunc;
18816               CI = Intrinsic::experimental_constrained_trunc; break;
18817       case 6: ID = Intrinsic::ceil;
18818               CI = Intrinsic::experimental_constrained_ceil; break;
18819       case 7: ID = Intrinsic::floor;
18820               CI = Intrinsic::experimental_constrained_floor; break;
18821       }
18822       break;
18823     }
18824     if (ID != Intrinsic::not_intrinsic) {
18825       if (Builder.getIsFPConstrained()) {
18826         Function *F = CGM.getIntrinsic(CI, ResultType);
18827         return Builder.CreateConstrainedFPCall(F, X);
18828       } else {
18829         Function *F = CGM.getIntrinsic(ID, ResultType);
18830         return Builder.CreateCall(F, X);
18831       }
18832     }
18833     switch (BuiltinID) { // FIXME: constrained version?
18834       case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
18835       case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
18836       default: llvm_unreachable("Unknown BuiltinID");
18837     }
18838     Function *F = CGM.getIntrinsic(ID);
18839     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18840     Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
18841     return Builder.CreateCall(F, {X, M4Value, M5Value});
18842   }
18843   case SystemZ::BI__builtin_s390_vfmaxsb:
18844   case SystemZ::BI__builtin_s390_vfmaxdb: {
18845     llvm::Type *ResultType = ConvertType(E->getType());
18846     Value *X = EmitScalarExpr(E->getArg(0));
18847     Value *Y = EmitScalarExpr(E->getArg(1));
18848     // Constant-fold the M4 mask argument.
18849     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
18850     // Check whether this instance can be represented via an LLVM standard
18851     // intrinsic.  We only support some values of M4.
18852     Intrinsic::ID ID = Intrinsic::not_intrinsic;
18853     Intrinsic::ID CI;
18854     switch (M4.getZExtValue()) {
18855     default: break;
18856     case 4: ID = Intrinsic::maxnum;
18857             CI = Intrinsic::experimental_constrained_maxnum; break;
18858     }
18859     if (ID != Intrinsic::not_intrinsic) {
18860       if (Builder.getIsFPConstrained()) {
18861         Function *F = CGM.getIntrinsic(CI, ResultType);
18862         return Builder.CreateConstrainedFPCall(F, {X, Y});
18863       } else {
18864         Function *F = CGM.getIntrinsic(ID, ResultType);
18865         return Builder.CreateCall(F, {X, Y});
18866       }
18867     }
18868     switch (BuiltinID) {
18869       case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
18870       case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
18871       default: llvm_unreachable("Unknown BuiltinID");
18872     }
18873     Function *F = CGM.getIntrinsic(ID);
18874     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18875     return Builder.CreateCall(F, {X, Y, M4Value});
18876   }
18877   case SystemZ::BI__builtin_s390_vfminsb:
18878   case SystemZ::BI__builtin_s390_vfmindb: {
18879     llvm::Type *ResultType = ConvertType(E->getType());
18880     Value *X = EmitScalarExpr(E->getArg(0));
18881     Value *Y = EmitScalarExpr(E->getArg(1));
18882     // Constant-fold the M4 mask argument.
18883     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
18884     // Check whether this instance can be represented via an LLVM standard
18885     // intrinsic.  We only support some values of M4.
18886     Intrinsic::ID ID = Intrinsic::not_intrinsic;
18887     Intrinsic::ID CI;
18888     switch (M4.getZExtValue()) {
18889     default: break;
18890     case 4: ID = Intrinsic::minnum;
18891             CI = Intrinsic::experimental_constrained_minnum; break;
18892     }
18893     if (ID != Intrinsic::not_intrinsic) {
18894       if (Builder.getIsFPConstrained()) {
18895         Function *F = CGM.getIntrinsic(CI, ResultType);
18896         return Builder.CreateConstrainedFPCall(F, {X, Y});
18897       } else {
18898         Function *F = CGM.getIntrinsic(ID, ResultType);
18899         return Builder.CreateCall(F, {X, Y});
18900       }
18901     }
18902     switch (BuiltinID) {
18903       case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
18904       case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
18905       default: llvm_unreachable("Unknown BuiltinID");
18906     }
18907     Function *F = CGM.getIntrinsic(ID);
18908     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18909     return Builder.CreateCall(F, {X, Y, M4Value});
18910   }
18911 
18912   case SystemZ::BI__builtin_s390_vlbrh:
18913   case SystemZ::BI__builtin_s390_vlbrf:
18914   case SystemZ::BI__builtin_s390_vlbrg: {
18915     llvm::Type *ResultType = ConvertType(E->getType());
18916     Value *X = EmitScalarExpr(E->getArg(0));
18917     Function *F = CGM.getIntrinsic(Intrinsic::bswap, ResultType);
18918     return Builder.CreateCall(F, X);
18919   }
18920 
18921   // Vector intrinsics that output the post-instruction CC value.
18922 
18923 #define INTRINSIC_WITH_CC(NAME) \
18924     case SystemZ::BI__builtin_##NAME: \
18925       return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
18926 
18927   INTRINSIC_WITH_CC(s390_vpkshs);
18928   INTRINSIC_WITH_CC(s390_vpksfs);
18929   INTRINSIC_WITH_CC(s390_vpksgs);
18930 
18931   INTRINSIC_WITH_CC(s390_vpklshs);
18932   INTRINSIC_WITH_CC(s390_vpklsfs);
18933   INTRINSIC_WITH_CC(s390_vpklsgs);
18934 
18935   INTRINSIC_WITH_CC(s390_vceqbs);
18936   INTRINSIC_WITH_CC(s390_vceqhs);
18937   INTRINSIC_WITH_CC(s390_vceqfs);
18938   INTRINSIC_WITH_CC(s390_vceqgs);
18939 
18940   INTRINSIC_WITH_CC(s390_vchbs);
18941   INTRINSIC_WITH_CC(s390_vchhs);
18942   INTRINSIC_WITH_CC(s390_vchfs);
18943   INTRINSIC_WITH_CC(s390_vchgs);
18944 
18945   INTRINSIC_WITH_CC(s390_vchlbs);
18946   INTRINSIC_WITH_CC(s390_vchlhs);
18947   INTRINSIC_WITH_CC(s390_vchlfs);
18948   INTRINSIC_WITH_CC(s390_vchlgs);
18949 
18950   INTRINSIC_WITH_CC(s390_vfaebs);
18951   INTRINSIC_WITH_CC(s390_vfaehs);
18952   INTRINSIC_WITH_CC(s390_vfaefs);
18953 
18954   INTRINSIC_WITH_CC(s390_vfaezbs);
18955   INTRINSIC_WITH_CC(s390_vfaezhs);
18956   INTRINSIC_WITH_CC(s390_vfaezfs);
18957 
18958   INTRINSIC_WITH_CC(s390_vfeebs);
18959   INTRINSIC_WITH_CC(s390_vfeehs);
18960   INTRINSIC_WITH_CC(s390_vfeefs);
18961 
18962   INTRINSIC_WITH_CC(s390_vfeezbs);
18963   INTRINSIC_WITH_CC(s390_vfeezhs);
18964   INTRINSIC_WITH_CC(s390_vfeezfs);
18965 
18966   INTRINSIC_WITH_CC(s390_vfenebs);
18967   INTRINSIC_WITH_CC(s390_vfenehs);
18968   INTRINSIC_WITH_CC(s390_vfenefs);
18969 
18970   INTRINSIC_WITH_CC(s390_vfenezbs);
18971   INTRINSIC_WITH_CC(s390_vfenezhs);
18972   INTRINSIC_WITH_CC(s390_vfenezfs);
18973 
18974   INTRINSIC_WITH_CC(s390_vistrbs);
18975   INTRINSIC_WITH_CC(s390_vistrhs);
18976   INTRINSIC_WITH_CC(s390_vistrfs);
18977 
18978   INTRINSIC_WITH_CC(s390_vstrcbs);
18979   INTRINSIC_WITH_CC(s390_vstrchs);
18980   INTRINSIC_WITH_CC(s390_vstrcfs);
18981 
18982   INTRINSIC_WITH_CC(s390_vstrczbs);
18983   INTRINSIC_WITH_CC(s390_vstrczhs);
18984   INTRINSIC_WITH_CC(s390_vstrczfs);
18985 
18986   INTRINSIC_WITH_CC(s390_vfcesbs);
18987   INTRINSIC_WITH_CC(s390_vfcedbs);
18988   INTRINSIC_WITH_CC(s390_vfchsbs);
18989   INTRINSIC_WITH_CC(s390_vfchdbs);
18990   INTRINSIC_WITH_CC(s390_vfchesbs);
18991   INTRINSIC_WITH_CC(s390_vfchedbs);
18992 
18993   INTRINSIC_WITH_CC(s390_vftcisb);
18994   INTRINSIC_WITH_CC(s390_vftcidb);
18995 
18996   INTRINSIC_WITH_CC(s390_vstrsb);
18997   INTRINSIC_WITH_CC(s390_vstrsh);
18998   INTRINSIC_WITH_CC(s390_vstrsf);
18999 
19000   INTRINSIC_WITH_CC(s390_vstrszb);
19001   INTRINSIC_WITH_CC(s390_vstrszh);
19002   INTRINSIC_WITH_CC(s390_vstrszf);
19003 
19004 #undef INTRINSIC_WITH_CC
19005 
19006   default:
19007     return nullptr;
19008   }
19009 }
19010 
19011 namespace {
// Helper classes for mapping MMA builtins to the corresponding LLVM intrinsic
// variant.
19013 struct NVPTXMmaLdstInfo {
19014   unsigned NumResults;  // Number of elements to load/store
19015   // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
19016   unsigned IID_col;
19017   unsigned IID_row;
19018 };
19019 
19020 #define MMA_INTR(geom_op_type, layout) \
19021   Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
19022 #define MMA_LDST(n, geom_op_type)                                              \
19023   { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
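// For illustration (expansion sketch): MMA_LDST(8, m16n16k16_load_a_f16)
// expands to
//   { 8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//        Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride }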
19024 
19025 static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
19026   switch (BuiltinID) {
19027   // FP MMA loads
19028   case NVPTX::BI__hmma_m16n16k16_ld_a:
19029     return MMA_LDST(8, m16n16k16_load_a_f16);
19030   case NVPTX::BI__hmma_m16n16k16_ld_b:
19031     return MMA_LDST(8, m16n16k16_load_b_f16);
19032   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
19033     return MMA_LDST(4, m16n16k16_load_c_f16);
19034   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
19035     return MMA_LDST(8, m16n16k16_load_c_f32);
19036   case NVPTX::BI__hmma_m32n8k16_ld_a:
19037     return MMA_LDST(8, m32n8k16_load_a_f16);
19038   case NVPTX::BI__hmma_m32n8k16_ld_b:
19039     return MMA_LDST(8, m32n8k16_load_b_f16);
19040   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
19041     return MMA_LDST(4, m32n8k16_load_c_f16);
19042   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
19043     return MMA_LDST(8, m32n8k16_load_c_f32);
19044   case NVPTX::BI__hmma_m8n32k16_ld_a:
19045     return MMA_LDST(8, m8n32k16_load_a_f16);
19046   case NVPTX::BI__hmma_m8n32k16_ld_b:
19047     return MMA_LDST(8, m8n32k16_load_b_f16);
19048   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
19049     return MMA_LDST(4, m8n32k16_load_c_f16);
19050   case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
19051     return MMA_LDST(8, m8n32k16_load_c_f32);
19052 
19053   // Integer MMA loads
19054   case NVPTX::BI__imma_m16n16k16_ld_a_s8:
19055     return MMA_LDST(2, m16n16k16_load_a_s8);
19056   case NVPTX::BI__imma_m16n16k16_ld_a_u8:
19057     return MMA_LDST(2, m16n16k16_load_a_u8);
19058   case NVPTX::BI__imma_m16n16k16_ld_b_s8:
19059     return MMA_LDST(2, m16n16k16_load_b_s8);
19060   case NVPTX::BI__imma_m16n16k16_ld_b_u8:
19061     return MMA_LDST(2, m16n16k16_load_b_u8);
19062   case NVPTX::BI__imma_m16n16k16_ld_c:
19063     return MMA_LDST(8, m16n16k16_load_c_s32);
19064   case NVPTX::BI__imma_m32n8k16_ld_a_s8:
19065     return MMA_LDST(4, m32n8k16_load_a_s8);
19066   case NVPTX::BI__imma_m32n8k16_ld_a_u8:
19067     return MMA_LDST(4, m32n8k16_load_a_u8);
19068   case NVPTX::BI__imma_m32n8k16_ld_b_s8:
19069     return MMA_LDST(1, m32n8k16_load_b_s8);
19070   case NVPTX::BI__imma_m32n8k16_ld_b_u8:
19071     return MMA_LDST(1, m32n8k16_load_b_u8);
19072   case NVPTX::BI__imma_m32n8k16_ld_c:
19073     return MMA_LDST(8, m32n8k16_load_c_s32);
19074   case NVPTX::BI__imma_m8n32k16_ld_a_s8:
19075     return MMA_LDST(1, m8n32k16_load_a_s8);
19076   case NVPTX::BI__imma_m8n32k16_ld_a_u8:
19077     return MMA_LDST(1, m8n32k16_load_a_u8);
19078   case NVPTX::BI__imma_m8n32k16_ld_b_s8:
19079     return MMA_LDST(4, m8n32k16_load_b_s8);
19080   case NVPTX::BI__imma_m8n32k16_ld_b_u8:
19081     return MMA_LDST(4, m8n32k16_load_b_u8);
19082   case NVPTX::BI__imma_m8n32k16_ld_c:
19083     return MMA_LDST(8, m8n32k16_load_c_s32);
19084 
19085   // Sub-integer MMA loads.
19086   // Only row/col layout is supported by A/B fragments.
19087   case NVPTX::BI__imma_m8n8k32_ld_a_s4:
19088     return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
19089   case NVPTX::BI__imma_m8n8k32_ld_a_u4:
19090     return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
19091   case NVPTX::BI__imma_m8n8k32_ld_b_s4:
19092     return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
19093   case NVPTX::BI__imma_m8n8k32_ld_b_u4:
19094     return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
19095   case NVPTX::BI__imma_m8n8k32_ld_c:
19096     return MMA_LDST(2, m8n8k32_load_c_s32);
19097   case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
19098     return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
19099   case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
19100     return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
19101   case NVPTX::BI__bmma_m8n8k128_ld_c:
19102     return MMA_LDST(2, m8n8k128_load_c_s32);
19103 
19104   // Double MMA loads
19105   case NVPTX::BI__dmma_m8n8k4_ld_a:
19106     return MMA_LDST(1, m8n8k4_load_a_f64);
19107   case NVPTX::BI__dmma_m8n8k4_ld_b:
19108     return MMA_LDST(1, m8n8k4_load_b_f64);
19109   case NVPTX::BI__dmma_m8n8k4_ld_c:
19110     return MMA_LDST(2, m8n8k4_load_c_f64);
19111 
19112   // Alternate float MMA loads
19113   case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
19114     return MMA_LDST(4, m16n16k16_load_a_bf16);
19115   case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
19116     return MMA_LDST(4, m16n16k16_load_b_bf16);
19117   case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
19118     return MMA_LDST(2, m8n32k16_load_a_bf16);
19119   case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
19120     return MMA_LDST(8, m8n32k16_load_b_bf16);
19121   case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
19122     return MMA_LDST(8, m32n8k16_load_a_bf16);
19123   case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
19124     return MMA_LDST(2, m32n8k16_load_b_bf16);
19125   case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
19126     return MMA_LDST(4, m16n16k8_load_a_tf32);
19127   case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
19128     return MMA_LDST(4, m16n16k8_load_b_tf32);
19129   case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
19130     return MMA_LDST(8, m16n16k8_load_c_f32);
19131 
  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
19135   // FP MMA stores.
19136   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
19137     return MMA_LDST(4, m16n16k16_store_d_f16);
19138   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
19139     return MMA_LDST(8, m16n16k16_store_d_f32);
19140   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
19141     return MMA_LDST(4, m32n8k16_store_d_f16);
19142   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
19143     return MMA_LDST(8, m32n8k16_store_d_f32);
19144   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
19145     return MMA_LDST(4, m8n32k16_store_d_f16);
19146   case NVPTX::BI__hmma_m8n32k16_st_c_f32:
19147     return MMA_LDST(8, m8n32k16_store_d_f32);
19148 
19149   // Integer and sub-integer MMA stores.
19150   // Another naming quirk. Unlike other MMA builtins that use PTX types in the
19151   // name, integer loads/stores use LLVM's i32.
19152   case NVPTX::BI__imma_m16n16k16_st_c_i32:
19153     return MMA_LDST(8, m16n16k16_store_d_s32);
19154   case NVPTX::BI__imma_m32n8k16_st_c_i32:
19155     return MMA_LDST(8, m32n8k16_store_d_s32);
19156   case NVPTX::BI__imma_m8n32k16_st_c_i32:
19157     return MMA_LDST(8, m8n32k16_store_d_s32);
19158   case NVPTX::BI__imma_m8n8k32_st_c_i32:
19159     return MMA_LDST(2, m8n8k32_store_d_s32);
19160   case NVPTX::BI__bmma_m8n8k128_st_c_i32:
19161     return MMA_LDST(2, m8n8k128_store_d_s32);
19162 
19163   // Double MMA store
19164   case NVPTX::BI__dmma_m8n8k4_st_c_f64:
19165     return MMA_LDST(2, m8n8k4_store_d_f64);
19166 
19167   // Alternate float MMA store
19168   case NVPTX::BI__mma_m16n16k8_st_c_f32:
19169     return MMA_LDST(8, m16n16k8_store_d_f32);
19170 
19171   default:
19172     llvm_unreachable("Unknown MMA builtin");
19173   }
19174 }
19175 #undef MMA_LDST
19176 #undef MMA_INTR
19177 
19178 
19179 struct NVPTXMmaInfo {
19180   unsigned NumEltsA;
19181   unsigned NumEltsB;
19182   unsigned NumEltsC;
19183   unsigned NumEltsD;
19184 
19185   // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
19186   // over 'col' for layout. The index of non-satf variants is expected to match
19187   // the undocumented layout constants used by CUDA's mma.hpp.
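  // For example, Layout == 1 (A row-major, B column-major) with Satf == true
  // selects index 1 + 4 * 1 == 5, i.e. the row/col _satfinite entry in
  // MMA_SATF_VARIANTS-based tables.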
19188   std::array<unsigned, 8> Variants;
19189 
19190   unsigned getMMAIntrinsic(int Layout, bool Satf) {
19191     unsigned Index = Layout + 4 * Satf;
19192     if (Index >= Variants.size())
19193       return 0;
19194     return Variants[Index];
19195   }
19196 };
19197 
// Returns an NVPTXMmaInfo whose getMMAIntrinsic(Layout, Satf) yields the
// matching intrinsic for each valid combination of Layout and Satf, and 0
// otherwise.
19200 static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
19201   // clang-format off
19202 #define MMA_VARIANTS(geom, type)                                    \
19203       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
19204       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
19205       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
19206       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
19207 #define MMA_SATF_VARIANTS(geom, type)                               \
19208       MMA_VARIANTS(geom, type),                                     \
19209       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
19210       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
19211       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
19212       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
19213 // Sub-integer MMA only supports row.col layout.
19214 #define MMA_VARIANTS_I4(geom, type) \
19215       0, \
19216       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
19217       0, \
19218       0, \
19219       0, \
19220       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
19221       0, \
19222       0
19223 // b1 MMA does not support .satfinite.
19224 #define MMA_VARIANTS_B1_XOR(geom, type) \
19225       0, \
19226       Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,             \
19227       0, \
19228       0, \
19229       0, \
19230       0, \
19231       0, \
19232       0
19233 #define MMA_VARIANTS_B1_AND(geom, type) \
19234       0, \
19235       Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,             \
19236       0, \
19237       0, \
19238       0, \
19239       0, \
19240       0, \
19241       0
19242   // clang-format on
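  // For illustration (expansion sketch): MMA_VARIANTS(m8n8k4, f64) lists
  //   Intrinsic::nvvm_wmma_m8n8k4_mma_row_row_f64,
  //   Intrinsic::nvvm_wmma_m8n8k4_mma_row_col_f64,
  //   Intrinsic::nvvm_wmma_m8n8k4_mma_col_row_f64,
  //   Intrinsic::nvvm_wmma_m8n8k4_mma_col_col_f64
  // in the layout order (row/row, row/col, col/row, col/col) that
  // NVPTXMmaInfo::getMMAIntrinsic indexes into; MMA_SATF_VARIANTS appends the
  // corresponding _satfinite entries at indices 4-7.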
19243   switch (BuiltinID) {
19244   // FP MMA
  // Note that the 'type' argument of MMA_SATF_VARIANTS uses D_C notation,
  // while the NumElts* fields of the returned value are ordered as A, B, C, D.
19247   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
19248     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
19249   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
19250     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
19251   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
19252     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
19253   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
19254     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
19255   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
19256     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
19257   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
19258     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
19259   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
19260     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
19261   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
19262     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
19263   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
19264     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
19265   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
19266     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
19267   case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
19268     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
19269   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
19270     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};
19271 
19272   // Integer MMA
19273   case NVPTX::BI__imma_m16n16k16_mma_s8:
19274     return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
19275   case NVPTX::BI__imma_m16n16k16_mma_u8:
19276     return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
19277   case NVPTX::BI__imma_m32n8k16_mma_s8:
19278     return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
19279   case NVPTX::BI__imma_m32n8k16_mma_u8:
19280     return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
19281   case NVPTX::BI__imma_m8n32k16_mma_s8:
19282     return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
19283   case NVPTX::BI__imma_m8n32k16_mma_u8:
19284     return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};
19285 
19286   // Sub-integer MMA
19287   case NVPTX::BI__imma_m8n8k32_mma_s4:
19288     return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
19289   case NVPTX::BI__imma_m8n8k32_mma_u4:
19290     return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
19291   case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
19292     return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
19293   case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
19294     return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};
19295 
19296   // Double MMA
19297   case NVPTX::BI__dmma_m8n8k4_mma_f64:
19298     return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};
19299 
19300   // Alternate FP MMA
19301   case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
19302     return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
19303   case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
19304     return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
19305   case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
19306     return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
19307   case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
19308     return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
19309   default:
19310     llvm_unreachable("Unexpected builtin ID.");
19311   }
19312 #undef MMA_VARIANTS
19313 #undef MMA_SATF_VARIANTS
19314 #undef MMA_VARIANTS_I4
19315 #undef MMA_VARIANTS_B1_AND
19316 #undef MMA_VARIANTS_B1_XOR
19317 }
19318 
19319 static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
19320                          const CallExpr *E) {
19321   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
19322   QualType ArgType = E->getArg(0)->getType();
19323   clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
19324   llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
19325   return CGF.Builder.CreateCall(
19326       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
19327       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
19328 }
19329 
19330 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
19331                                const CallExpr *E) {
19332   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
19333   llvm::Type *ElemTy =
19334       CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19335   return CGF.Builder.CreateCall(
19336       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
19337       {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
19338 }
19339 
19340 static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
19341                           CodeGenFunction &CGF, const CallExpr *E,
19342                           int SrcSize) {
19343   return E->getNumArgs() == 3
19344              ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
19345                                       {CGF.EmitScalarExpr(E->getArg(0)),
19346                                        CGF.EmitScalarExpr(E->getArg(1)),
19347                                        CGF.EmitScalarExpr(E->getArg(2))})
19348              : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
19349                                       {CGF.EmitScalarExpr(E->getArg(0)),
19350                                        CGF.EmitScalarExpr(E->getArg(1))});
19351 }
19352 
19353 static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
19354                            const CallExpr *E, CodeGenFunction &CGF) {
19355   auto &C = CGF.CGM.getContext();
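  // These builtins require the half type to be directly usable: diagnose
  // unless the language has a native half type or the target does not use
  // FP16 conversion intrinsics.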
19356   if (!(C.getLangOpts().NativeHalfType ||
19357         !C.getTargetInfo().useFP16ConversionIntrinsics())) {
19358     CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getName(BuiltinID).str() +
19359                                        " requires native half type support.");
19360     return nullptr;
19361   }
19362 
19363   if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
19364       IntrinsicID == Intrinsic::nvvm_ldu_global_f)
19365     return MakeLdgLdu(IntrinsicID, CGF, E);
19366 
19367   SmallVector<Value *, 16> Args;
19368   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
19369   auto *FTy = F->getFunctionType();
19370   unsigned ICEArguments = 0;
19371   ASTContext::GetBuiltinTypeError Error;
19372   C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
19373   assert(Error == ASTContext::GE_None && "Should not codegen an error");
19374   for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
19375     assert((ICEArguments & (1 << i)) == 0);
19376     auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
19377     auto *PTy = FTy->getParamType(i);
19378     if (PTy != ArgValue->getType())
19379       ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
19380     Args.push_back(ArgValue);
19381   }
19382 
19383   return CGF.Builder.CreateCall(F, Args);
19384 }
19385 } // namespace
19386 
19387 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
19388                                              const CallExpr *E) {
19389   switch (BuiltinID) {
19390   case NVPTX::BI__nvvm_atom_add_gen_i:
19391   case NVPTX::BI__nvvm_atom_add_gen_l:
19392   case NVPTX::BI__nvvm_atom_add_gen_ll:
19393     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
19394 
19395   case NVPTX::BI__nvvm_atom_sub_gen_i:
19396   case NVPTX::BI__nvvm_atom_sub_gen_l:
19397   case NVPTX::BI__nvvm_atom_sub_gen_ll:
19398     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
19399 
19400   case NVPTX::BI__nvvm_atom_and_gen_i:
19401   case NVPTX::BI__nvvm_atom_and_gen_l:
19402   case NVPTX::BI__nvvm_atom_and_gen_ll:
19403     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
19404 
19405   case NVPTX::BI__nvvm_atom_or_gen_i:
19406   case NVPTX::BI__nvvm_atom_or_gen_l:
19407   case NVPTX::BI__nvvm_atom_or_gen_ll:
19408     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
19409 
19410   case NVPTX::BI__nvvm_atom_xor_gen_i:
19411   case NVPTX::BI__nvvm_atom_xor_gen_l:
19412   case NVPTX::BI__nvvm_atom_xor_gen_ll:
19413     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
19414 
19415   case NVPTX::BI__nvvm_atom_xchg_gen_i:
19416   case NVPTX::BI__nvvm_atom_xchg_gen_l:
19417   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
19418     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
19419 
19420   case NVPTX::BI__nvvm_atom_max_gen_i:
19421   case NVPTX::BI__nvvm_atom_max_gen_l:
19422   case NVPTX::BI__nvvm_atom_max_gen_ll:
19423     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
19424 
19425   case NVPTX::BI__nvvm_atom_max_gen_ui:
19426   case NVPTX::BI__nvvm_atom_max_gen_ul:
19427   case NVPTX::BI__nvvm_atom_max_gen_ull:
19428     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
19429 
19430   case NVPTX::BI__nvvm_atom_min_gen_i:
19431   case NVPTX::BI__nvvm_atom_min_gen_l:
19432   case NVPTX::BI__nvvm_atom_min_gen_ll:
19433     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
19434 
19435   case NVPTX::BI__nvvm_atom_min_gen_ui:
19436   case NVPTX::BI__nvvm_atom_min_gen_ul:
19437   case NVPTX::BI__nvvm_atom_min_gen_ull:
19438     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
19439 
19440   case NVPTX::BI__nvvm_atom_cas_gen_i:
19441   case NVPTX::BI__nvvm_atom_cas_gen_l:
19442   case NVPTX::BI__nvvm_atom_cas_gen_ll:
19443     // __nvvm_atom_cas_gen_* should return the old value rather than the
19444     // success flag.
19445     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
19446 
19447   case NVPTX::BI__nvvm_atom_add_gen_f:
19448   case NVPTX::BI__nvvm_atom_add_gen_d: {
19449     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
19450     Value *Val = EmitScalarExpr(E->getArg(1));
19451 
19452     return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
19453                                    AtomicOrdering::SequentiallyConsistent);
19454   }
19455 
19456   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
19457     Value *Ptr = EmitScalarExpr(E->getArg(0));
19458     Value *Val = EmitScalarExpr(E->getArg(1));
19459     Function *FnALI32 =
19460         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
19461     return Builder.CreateCall(FnALI32, {Ptr, Val});
19462   }
19463 
19464   case NVPTX::BI__nvvm_atom_dec_gen_ui: {
19465     Value *Ptr = EmitScalarExpr(E->getArg(0));
19466     Value *Val = EmitScalarExpr(E->getArg(1));
19467     Function *FnALD32 =
19468         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
19469     return Builder.CreateCall(FnALD32, {Ptr, Val});
19470   }
19471 
19472   case NVPTX::BI__nvvm_ldg_c:
19473   case NVPTX::BI__nvvm_ldg_sc:
19474   case NVPTX::BI__nvvm_ldg_c2:
19475   case NVPTX::BI__nvvm_ldg_sc2:
19476   case NVPTX::BI__nvvm_ldg_c4:
19477   case NVPTX::BI__nvvm_ldg_sc4:
19478   case NVPTX::BI__nvvm_ldg_s:
19479   case NVPTX::BI__nvvm_ldg_s2:
19480   case NVPTX::BI__nvvm_ldg_s4:
19481   case NVPTX::BI__nvvm_ldg_i:
19482   case NVPTX::BI__nvvm_ldg_i2:
19483   case NVPTX::BI__nvvm_ldg_i4:
19484   case NVPTX::BI__nvvm_ldg_l:
19485   case NVPTX::BI__nvvm_ldg_l2:
19486   case NVPTX::BI__nvvm_ldg_ll:
19487   case NVPTX::BI__nvvm_ldg_ll2:
19488   case NVPTX::BI__nvvm_ldg_uc:
19489   case NVPTX::BI__nvvm_ldg_uc2:
19490   case NVPTX::BI__nvvm_ldg_uc4:
19491   case NVPTX::BI__nvvm_ldg_us:
19492   case NVPTX::BI__nvvm_ldg_us2:
19493   case NVPTX::BI__nvvm_ldg_us4:
19494   case NVPTX::BI__nvvm_ldg_ui:
19495   case NVPTX::BI__nvvm_ldg_ui2:
19496   case NVPTX::BI__nvvm_ldg_ui4:
19497   case NVPTX::BI__nvvm_ldg_ul:
19498   case NVPTX::BI__nvvm_ldg_ul2:
19499   case NVPTX::BI__nvvm_ldg_ull:
19500   case NVPTX::BI__nvvm_ldg_ull2:
19501     // PTX Interoperability section 2.2: "For a vector with an even number of
19502     // elements, its alignment is set to number of elements times the alignment
19503     // of its member: n*alignof(t)."
19504     return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
19505   case NVPTX::BI__nvvm_ldg_f:
19506   case NVPTX::BI__nvvm_ldg_f2:
19507   case NVPTX::BI__nvvm_ldg_f4:
19508   case NVPTX::BI__nvvm_ldg_d:
19509   case NVPTX::BI__nvvm_ldg_d2:
19510     return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
19511 
19512   case NVPTX::BI__nvvm_ldu_c:
19513   case NVPTX::BI__nvvm_ldu_sc:
19514   case NVPTX::BI__nvvm_ldu_c2:
19515   case NVPTX::BI__nvvm_ldu_sc2:
19516   case NVPTX::BI__nvvm_ldu_c4:
19517   case NVPTX::BI__nvvm_ldu_sc4:
19518   case NVPTX::BI__nvvm_ldu_s:
19519   case NVPTX::BI__nvvm_ldu_s2:
19520   case NVPTX::BI__nvvm_ldu_s4:
19521   case NVPTX::BI__nvvm_ldu_i:
19522   case NVPTX::BI__nvvm_ldu_i2:
19523   case NVPTX::BI__nvvm_ldu_i4:
19524   case NVPTX::BI__nvvm_ldu_l:
19525   case NVPTX::BI__nvvm_ldu_l2:
19526   case NVPTX::BI__nvvm_ldu_ll:
19527   case NVPTX::BI__nvvm_ldu_ll2:
19528   case NVPTX::BI__nvvm_ldu_uc:
19529   case NVPTX::BI__nvvm_ldu_uc2:
19530   case NVPTX::BI__nvvm_ldu_uc4:
19531   case NVPTX::BI__nvvm_ldu_us:
19532   case NVPTX::BI__nvvm_ldu_us2:
19533   case NVPTX::BI__nvvm_ldu_us4:
19534   case NVPTX::BI__nvvm_ldu_ui:
19535   case NVPTX::BI__nvvm_ldu_ui2:
19536   case NVPTX::BI__nvvm_ldu_ui4:
19537   case NVPTX::BI__nvvm_ldu_ul:
19538   case NVPTX::BI__nvvm_ldu_ul2:
19539   case NVPTX::BI__nvvm_ldu_ull:
19540   case NVPTX::BI__nvvm_ldu_ull2:
19541     return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
19542   case NVPTX::BI__nvvm_ldu_f:
19543   case NVPTX::BI__nvvm_ldu_f2:
19544   case NVPTX::BI__nvvm_ldu_f4:
19545   case NVPTX::BI__nvvm_ldu_d:
19546   case NVPTX::BI__nvvm_ldu_d2:
19547     return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
19548 
19549   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
19550   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
19551   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
19552     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
19553   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
19554   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
19555   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
19556     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
19557   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
19558   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
19559     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
19560   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
19561   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
19562     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
19563   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
19564   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
19565   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
19566     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
19567   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
19568   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
19569   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
19570     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
19571   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
19572   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
19573   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
19574   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
19575   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
19576   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
19577     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
19578   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
19579   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
19580   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
19581   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
19582   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
19583   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
19584     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
19585   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
19586   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
19587   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
19588   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
19589   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
19590   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
19591     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
19592   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
19593   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
19594   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
19595   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
19596   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
19597   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
19598     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
19599   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
19600     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
19601   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
19602     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
19603   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
19604     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
19605   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
19606     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
19607   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
19608   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
19609   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
19610     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
19611   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
19612   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
19613   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
19614     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
19615   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
19616   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
19617   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
19618     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
19619   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
19620   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
19621   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
19622     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
19623   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
19624   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
19625   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
19626     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
19627   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
19628   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
19629   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
19630     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
19631   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
19632   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
19633   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
19634     Value *Ptr = EmitScalarExpr(E->getArg(0));
19635     llvm::Type *ElemTy =
19636         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19637     return Builder.CreateCall(
19638         CGM.getIntrinsic(
19639             Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
19640         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
19641   }
19642   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
19643   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
19644   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
19645     Value *Ptr = EmitScalarExpr(E->getArg(0));
19646     llvm::Type *ElemTy =
19647         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19648     return Builder.CreateCall(
19649         CGM.getIntrinsic(
19650             Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
19651         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
19652   }
19653   case NVPTX::BI__nvvm_match_all_sync_i32p:
19654   case NVPTX::BI__nvvm_match_all_sync_i64p: {
19655     Value *Mask = EmitScalarExpr(E->getArg(0));
19656     Value *Val = EmitScalarExpr(E->getArg(1));
19657     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
19658     Value *ResultPair = Builder.CreateCall(
19659         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
19660                              ? Intrinsic::nvvm_match_all_sync_i32p
19661                              : Intrinsic::nvvm_match_all_sync_i64p),
19662         {Mask, Val});
19663     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
19664                                      PredOutPtr.getElementType());
19665     Builder.CreateStore(Pred, PredOutPtr);
19666     return Builder.CreateExtractValue(ResultPair, 0);
19667   }
19668 
19669   // FP MMA loads
19670   case NVPTX::BI__hmma_m16n16k16_ld_a:
19671   case NVPTX::BI__hmma_m16n16k16_ld_b:
19672   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
19673   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
19674   case NVPTX::BI__hmma_m32n8k16_ld_a:
19675   case NVPTX::BI__hmma_m32n8k16_ld_b:
19676   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
19677   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
19678   case NVPTX::BI__hmma_m8n32k16_ld_a:
19679   case NVPTX::BI__hmma_m8n32k16_ld_b:
19680   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
19681   case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
19682   // Integer MMA loads.
19683   case NVPTX::BI__imma_m16n16k16_ld_a_s8:
19684   case NVPTX::BI__imma_m16n16k16_ld_a_u8:
19685   case NVPTX::BI__imma_m16n16k16_ld_b_s8:
19686   case NVPTX::BI__imma_m16n16k16_ld_b_u8:
19687   case NVPTX::BI__imma_m16n16k16_ld_c:
19688   case NVPTX::BI__imma_m32n8k16_ld_a_s8:
19689   case NVPTX::BI__imma_m32n8k16_ld_a_u8:
19690   case NVPTX::BI__imma_m32n8k16_ld_b_s8:
19691   case NVPTX::BI__imma_m32n8k16_ld_b_u8:
19692   case NVPTX::BI__imma_m32n8k16_ld_c:
19693   case NVPTX::BI__imma_m8n32k16_ld_a_s8:
19694   case NVPTX::BI__imma_m8n32k16_ld_a_u8:
19695   case NVPTX::BI__imma_m8n32k16_ld_b_s8:
19696   case NVPTX::BI__imma_m8n32k16_ld_b_u8:
19697   case NVPTX::BI__imma_m8n32k16_ld_c:
19698   // Sub-integer MMA loads.
19699   case NVPTX::BI__imma_m8n8k32_ld_a_s4:
19700   case NVPTX::BI__imma_m8n8k32_ld_a_u4:
19701   case NVPTX::BI__imma_m8n8k32_ld_b_s4:
19702   case NVPTX::BI__imma_m8n8k32_ld_b_u4:
19703   case NVPTX::BI__imma_m8n8k32_ld_c:
19704   case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
19705   case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
19706   case NVPTX::BI__bmma_m8n8k128_ld_c:
19707   // Double MMA loads.
19708   case NVPTX::BI__dmma_m8n8k4_ld_a:
19709   case NVPTX::BI__dmma_m8n8k4_ld_b:
19710   case NVPTX::BI__dmma_m8n8k4_ld_c:
19711   // Alternate float MMA loads.
19712   case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
19713   case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
19714   case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
19715   case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
19716   case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
19717   case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
19718   case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
19719   case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
19720   case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
19721     Address Dst = EmitPointerWithAlignment(E->getArg(0));
19722     Value *Src = EmitScalarExpr(E->getArg(1));
19723     Value *Ldm = EmitScalarExpr(E->getArg(2));
19724     std::optional<llvm::APSInt> isColMajorArg =
19725         E->getArg(3)->getIntegerConstantExpr(getContext());
19726     if (!isColMajorArg)
19727       return nullptr;
19728     bool isColMajor = isColMajorArg->getSExtValue();
19729     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
19730     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
19731     if (IID == 0)
19732       return nullptr;
19733 
19734     Value *Result =
19735         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
19736 
19737     // Save returned values.
19738     assert(II.NumResults);
19739     if (II.NumResults == 1) {
19740       Builder.CreateAlignedStore(Result, Dst.getPointer(),
19741                                  CharUnits::fromQuantity(4));
19742     } else {
19743       for (unsigned i = 0; i < II.NumResults; ++i) {
19744         Builder.CreateAlignedStore(
19745             Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
19746                                   Dst.getElementType()),
19747             Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
19748                               llvm::ConstantInt::get(IntTy, i)),
19749             CharUnits::fromQuantity(4));
19750       }
19751     }
19752     return Result;
19753   }
19754 
19755   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
19756   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
19757   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
19758   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
19759   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
19760   case NVPTX::BI__hmma_m8n32k16_st_c_f32:
19761   case NVPTX::BI__imma_m16n16k16_st_c_i32:
19762   case NVPTX::BI__imma_m32n8k16_st_c_i32:
19763   case NVPTX::BI__imma_m8n32k16_st_c_i32:
19764   case NVPTX::BI__imma_m8n8k32_st_c_i32:
19765   case NVPTX::BI__bmma_m8n8k128_st_c_i32:
19766   case NVPTX::BI__dmma_m8n8k4_st_c_f64:
19767   case NVPTX::BI__mma_m16n16k8_st_c_f32: {
19768     Value *Dst = EmitScalarExpr(E->getArg(0));
19769     Address Src = EmitPointerWithAlignment(E->getArg(1));
19770     Value *Ldm = EmitScalarExpr(E->getArg(2));
19771     std::optional<llvm::APSInt> isColMajorArg =
19772         E->getArg(3)->getIntegerConstantExpr(getContext());
19773     if (!isColMajorArg)
19774       return nullptr;
19775     bool isColMajor = isColMajorArg->getSExtValue();
19776     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
19777     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
19778     if (IID == 0)
19779       return nullptr;
    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
19782     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
19783     SmallVector<Value *, 10> Values = {Dst};
19784     for (unsigned i = 0; i < II.NumResults; ++i) {
19785       Value *V = Builder.CreateAlignedLoad(
19786           Src.getElementType(),
19787           Builder.CreateGEP(Src.getElementType(), Src.getPointer(),
19788                             llvm::ConstantInt::get(IntTy, i)),
19789           CharUnits::fromQuantity(4));
19790       Values.push_back(Builder.CreateBitCast(V, ParamType));
19791     }
19792     Values.push_back(Ldm);
19793     Value *Result = Builder.CreateCall(Intrinsic, Values);
19794     return Result;
19795   }
19796 
  // BI__hmma_m16n16k16_mma_<DType><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_<layoutA>_<layoutB>_<DType>_<CType>
  // (plus a _satfinite suffix when satf is set).
19799   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
19800   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
19801   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
19802   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
19803   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
19804   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
19805   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
19806   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
19807   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
19808   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
19809   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
19810   case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
19811   case NVPTX::BI__imma_m16n16k16_mma_s8:
19812   case NVPTX::BI__imma_m16n16k16_mma_u8:
19813   case NVPTX::BI__imma_m32n8k16_mma_s8:
19814   case NVPTX::BI__imma_m32n8k16_mma_u8:
19815   case NVPTX::BI__imma_m8n32k16_mma_s8:
19816   case NVPTX::BI__imma_m8n32k16_mma_u8:
19817   case NVPTX::BI__imma_m8n8k32_mma_s4:
19818   case NVPTX::BI__imma_m8n8k32_mma_u4:
19819   case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
19820   case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
19821   case NVPTX::BI__dmma_m8n8k4_mma_f64:
19822   case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
19823   case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
19824   case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
19825   case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
19826     Address Dst = EmitPointerWithAlignment(E->getArg(0));
19827     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
19828     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
19829     Address SrcC = EmitPointerWithAlignment(E->getArg(3));
19830     std::optional<llvm::APSInt> LayoutArg =
19831         E->getArg(4)->getIntegerConstantExpr(getContext());
19832     if (!LayoutArg)
19833       return nullptr;
19834     int Layout = LayoutArg->getSExtValue();
19835     if (Layout < 0 || Layout > 3)
19836       return nullptr;
19837     llvm::APSInt SatfArg;
19838     if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
19839         BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
19840       SatfArg = 0;  // .b1 does not have satf argument.
19841     else if (std::optional<llvm::APSInt> OptSatfArg =
19842                  E->getArg(5)->getIntegerConstantExpr(getContext()))
19843       SatfArg = *OptSatfArg;
19844     else
19845       return nullptr;
19846     bool Satf = SatfArg.getSExtValue();
19847     NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
19848     unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
19849     if (IID == 0)  // Unsupported combination of Layout/Satf.
19850       return nullptr;
19851 
19852     SmallVector<Value *, 24> Values;
19853     Function *Intrinsic = CGM.getIntrinsic(IID);
19854     llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
19855     // Load A
19856     for (unsigned i = 0; i < MI.NumEltsA; ++i) {
19857       Value *V = Builder.CreateAlignedLoad(
19858           SrcA.getElementType(),
19859           Builder.CreateGEP(SrcA.getElementType(), SrcA.getPointer(),
19860                             llvm::ConstantInt::get(IntTy, i)),
19861           CharUnits::fromQuantity(4));
19862       Values.push_back(Builder.CreateBitCast(V, AType));
19863     }
19864     // Load B
19865     llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
19866     for (unsigned i = 0; i < MI.NumEltsB; ++i) {
19867       Value *V = Builder.CreateAlignedLoad(
19868           SrcB.getElementType(),
19869           Builder.CreateGEP(SrcB.getElementType(), SrcB.getPointer(),
19870                             llvm::ConstantInt::get(IntTy, i)),
19871           CharUnits::fromQuantity(4));
19872       Values.push_back(Builder.CreateBitCast(V, BType));
19873     }
19874     // Load C
19875     llvm::Type *CType =
19876         Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
19877     for (unsigned i = 0; i < MI.NumEltsC; ++i) {
19878       Value *V = Builder.CreateAlignedLoad(
19879           SrcC.getElementType(),
19880           Builder.CreateGEP(SrcC.getElementType(), SrcC.getPointer(),
19881                             llvm::ConstantInt::get(IntTy, i)),
19882           CharUnits::fromQuantity(4));
19883       Values.push_back(Builder.CreateBitCast(V, CType));
19884     }
19885     Value *Result = Builder.CreateCall(Intrinsic, Values);
19886     llvm::Type *DType = Dst.getElementType();
19887     for (unsigned i = 0; i < MI.NumEltsD; ++i)
19888       Builder.CreateAlignedStore(
19889           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
19890           Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
19891                             llvm::ConstantInt::get(IntTy, i)),
19892           CharUnits::fromQuantity(4));
19893     return Result;
19894   }
19895   // The following builtins require half type support
19896   case NVPTX::BI__nvvm_ex2_approx_f16:
19897     return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
19898   case NVPTX::BI__nvvm_ex2_approx_f16x2:
19899     return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
19900   case NVPTX::BI__nvvm_ff2f16x2_rn:
19901     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
19902   case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
19903     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
19904   case NVPTX::BI__nvvm_ff2f16x2_rz:
19905     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
19906   case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
19907     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
19908   case NVPTX::BI__nvvm_fma_rn_f16:
19909     return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
19910   case NVPTX::BI__nvvm_fma_rn_f16x2:
19911     return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
19912   case NVPTX::BI__nvvm_fma_rn_ftz_f16:
19913     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
19914   case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
19915     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
19916   case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
19917     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
19918                         *this);
19919   case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
19920     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
19921                         *this);
19922   case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
19923     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
19924                         *this);
19925   case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
19926     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
19927                         *this);
19928   case NVPTX::BI__nvvm_fma_rn_relu_f16:
19929     return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
19930   case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
19931     return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
19932   case NVPTX::BI__nvvm_fma_rn_sat_f16:
19933     return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
19934   case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
19935     return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
19936   case NVPTX::BI__nvvm_fmax_f16:
19937     return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
19938   case NVPTX::BI__nvvm_fmax_f16x2:
19939     return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
19940   case NVPTX::BI__nvvm_fmax_ftz_f16:
19941     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
19942   case NVPTX::BI__nvvm_fmax_ftz_f16x2:
19943     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
19944   case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
19945     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
19946   case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
19947     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
19948                         *this);
19949   case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
19950     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
19951                         E, *this);
19952   case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
19953     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
19954                         BuiltinID, E, *this);
19955   case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
19956     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
19957                         *this);
19958   case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
19959     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
19960                         E, *this);
19961   case NVPTX::BI__nvvm_fmax_nan_f16:
19962     return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
19963   case NVPTX::BI__nvvm_fmax_nan_f16x2:
19964     return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
19965   case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
19966     return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
19967                         *this);
19968   case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
19969     return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
19970                         E, *this);
19971   case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
19972     return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
19973                         *this);
19974   case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
19975     return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
19976                         *this);
19977   case NVPTX::BI__nvvm_fmin_f16:
19978     return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
19979   case NVPTX::BI__nvvm_fmin_f16x2:
19980     return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
19981   case NVPTX::BI__nvvm_fmin_ftz_f16:
19982     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
19983   case NVPTX::BI__nvvm_fmin_ftz_f16x2:
19984     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
19985   case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
19986     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
19987   case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
19988     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
19989                         *this);
19990   case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
19991     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
19992                         E, *this);
19993   case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
19994     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
19995                         BuiltinID, E, *this);
19996   case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
19997     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
19998                         *this);
19999   case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
20000     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
20001                         E, *this);
20002   case NVPTX::BI__nvvm_fmin_nan_f16:
20003     return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
20004   case NVPTX::BI__nvvm_fmin_nan_f16x2:
20005     return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
20006   case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
20007     return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
20008                         *this);
20009   case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
20010     return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
20011                         E, *this);
20012   case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
20013     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
20014                         *this);
20015   case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
20016     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
20017                         *this);
20018   case NVPTX::BI__nvvm_ldg_h:
20019     return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
20020   case NVPTX::BI__nvvm_ldg_h2:
20021     return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
20022   case NVPTX::BI__nvvm_ldu_h:
20023     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
20027   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
20028     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
20029                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
20030                        4);
20031   case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
20032     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
20033                        Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
20034                        8);
20035   case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
20036     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
20037                        Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
20038                        16);
20039   case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
20040     return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
20041                        Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
20042                        16);
20043   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
20044     return Builder.CreateCall(
20045         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
20046   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
20047     return Builder.CreateCall(
20048         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
20049   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
20050     return Builder.CreateCall(
20051         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
20052   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
20053     return Builder.CreateCall(
20054         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
20055   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
20056     return Builder.CreateCall(
20057         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
20058   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
20059     return Builder.CreateCall(
20060         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
20061   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
20062     return Builder.CreateCall(
20063         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
20064   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
20065     return Builder.CreateCall(
20066         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
20067   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
20068     return Builder.CreateCall(
20069         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
20070   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
20071     return Builder.CreateCall(
20072         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
20073   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
20074     return Builder.CreateCall(
20075         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
20076   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
20077     return Builder.CreateCall(
20078         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
20079   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
20080     return Builder.CreateCall(
20081         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
20082   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
20083     return Builder.CreateCall(
20084         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
20085   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
20086     return Builder.CreateCall(
20087         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
20088   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
20089     return Builder.CreateCall(
20090         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
20091   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
20092     return Builder.CreateCall(
20093         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
20094   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
20095     return Builder.CreateCall(
20096         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
20097   case NVPTX::BI__nvvm_is_explicit_cluster:
20098     return Builder.CreateCall(
20099         CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
20100   case NVPTX::BI__nvvm_isspacep_shared_cluster:
20101     return Builder.CreateCall(
20102         CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
20103         EmitScalarExpr(E->getArg(0)));
20104   case NVPTX::BI__nvvm_mapa:
20105     return Builder.CreateCall(
20106         CGM.getIntrinsic(Intrinsic::nvvm_mapa),
20107         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
20108   case NVPTX::BI__nvvm_mapa_shared_cluster:
20109     return Builder.CreateCall(
20110         CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
20111         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
20112   case NVPTX::BI__nvvm_getctarank:
20113     return Builder.CreateCall(
20114         CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
20115         EmitScalarExpr(E->getArg(0)));
20116   case NVPTX::BI__nvvm_getctarank_shared_cluster:
20117     return Builder.CreateCall(
20118         CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
20119         EmitScalarExpr(E->getArg(0)));
20120   case NVPTX::BI__nvvm_barrier_cluster_arrive:
20121     return Builder.CreateCall(
20122         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
20123   case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
20124     return Builder.CreateCall(
20125         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
20126   case NVPTX::BI__nvvm_barrier_cluster_wait:
20127     return Builder.CreateCall(
20128         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
20129   case NVPTX::BI__nvvm_fence_sc_cluster:
20130     return Builder.CreateCall(
20131         CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
20132   default:
20133     return nullptr;
20134   }
20135 }
20136 
20137 namespace {
20138 struct BuiltinAlignArgs {
20139   llvm::Value *Src = nullptr;
20140   llvm::Type *SrcType = nullptr;
20141   llvm::Value *Alignment = nullptr;
20142   llvm::Value *Mask = nullptr;
20143   llvm::IntegerType *IntType = nullptr;
20144 
20145   BuiltinAlignArgs(const CallExpr *E, CodeGenFunction &CGF) {
20146     QualType AstType = E->getArg(0)->getType();
20147     if (AstType->isArrayType())
20148       Src = CGF.EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20149     else
20150       Src = CGF.EmitScalarExpr(E->getArg(0));
20151     SrcType = Src->getType();
20152     if (SrcType->isPointerTy()) {
20153       IntType = IntegerType::get(
20154           CGF.getLLVMContext(),
20155           CGF.CGM.getDataLayout().getIndexTypeSizeInBits(SrcType));
20156     } else {
20157       assert(SrcType->isIntegerTy());
20158       IntType = cast<llvm::IntegerType>(SrcType);
20159     }
20160     Alignment = CGF.EmitScalarExpr(E->getArg(1));
20161     Alignment = CGF.Builder.CreateZExtOrTrunc(Alignment, IntType, "alignment");
20162     auto *One = llvm::ConstantInt::get(IntType, 1);
20163     Mask = CGF.Builder.CreateSub(Alignment, One, "mask");
20164   }
20165 };
20166 } // namespace
20167 
20168 /// Generate (x & (y-1)) == 0.
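/// For example, __builtin_is_aligned(p, 16) is emitted as (addr & 15) == 0,
/// where addr is the pointer value cast to the index-sized integer type.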
20169 RValue CodeGenFunction::EmitBuiltinIsAligned(const CallExpr *E) {
20170   BuiltinAlignArgs Args(E, *this);
20171   llvm::Value *SrcAddress = Args.Src;
20172   if (Args.SrcType->isPointerTy())
20173     SrcAddress =
20174         Builder.CreateBitOrPointerCast(Args.Src, Args.IntType, "src_addr");
20175   return RValue::get(Builder.CreateICmpEQ(
20176       Builder.CreateAnd(SrcAddress, Args.Mask, "set_bits"),
20177       llvm::Constant::getNullValue(Args.IntType), "is_aligned"));
20178 }
20179 
20180 /// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
20181 /// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
/// llvm.ptrmask intrinsic (preceded by a GEP in the align_up case).
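/// For example, with an integer argument, __builtin_align_up(x, 32) becomes
/// ((x + 31) & ~31) and __builtin_align_down(x, 32) becomes (x & ~31).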
20183 RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
20184   BuiltinAlignArgs Args(E, *this);
20185   llvm::Value *SrcForMask = Args.Src;
20186   if (AlignUp) {
    // When aligning up, first add the mask so that any misaligned value
    // crosses the next alignment boundary, then align down to the next valid
    // multiple. Because we add the mask (alignment - 1) rather than the
    // alignment itself, align_up on an already aligned value does not change
    // the value.
20191     if (Args.Src->getType()->isPointerTy()) {
20192       if (getLangOpts().isSignedOverflowDefined())
20193         SrcForMask =
20194             Builder.CreateGEP(Int8Ty, SrcForMask, Args.Mask, "over_boundary");
20195       else
20196         SrcForMask = EmitCheckedInBoundsGEP(Int8Ty, SrcForMask, Args.Mask,
20197                                             /*SignedIndices=*/true,
20198                                             /*isSubtraction=*/false,
20199                                             E->getExprLoc(), "over_boundary");
20200     } else {
20201       SrcForMask = Builder.CreateAdd(SrcForMask, Args.Mask, "over_boundary");
20202     }
20203   }
20204   // Invert the mask to only clear the lower bits.
20205   llvm::Value *InvertedMask = Builder.CreateNot(Args.Mask, "inverted_mask");
20206   llvm::Value *Result = nullptr;
20207   if (Args.Src->getType()->isPointerTy()) {
20208     Result = Builder.CreateIntrinsic(
20209         Intrinsic::ptrmask, {Args.SrcType, Args.IntType},
20210         {SrcForMask, InvertedMask}, nullptr, "aligned_result");
20211   } else {
20212     Result = Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result");
20213   }
20214   assert(Result->getType() == Args.SrcType);
20215   return RValue::get(Result);
20216 }
20217 
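/// Emit LLVM IR for a WebAssembly builtin call, mapping each builtin either
/// to a target-specific wasm intrinsic or to a generic LLVM operation.
/// Returns nullptr for builtins that are not handled here.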
20218 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
20219                                                    const CallExpr *E) {
20220   switch (BuiltinID) {
20221   case WebAssembly::BI__builtin_wasm_memory_size: {
20222     llvm::Type *ResultType = ConvertType(E->getType());
20223     Value *I = EmitScalarExpr(E->getArg(0));
20224     Function *Callee =
20225         CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
20226     return Builder.CreateCall(Callee, I);
20227   }
20228   case WebAssembly::BI__builtin_wasm_memory_grow: {
20229     llvm::Type *ResultType = ConvertType(E->getType());
20230     Value *Args[] = {EmitScalarExpr(E->getArg(0)),
20231                      EmitScalarExpr(E->getArg(1))};
20232     Function *Callee =
20233         CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
20234     return Builder.CreateCall(Callee, Args);
20235   }
20236   case WebAssembly::BI__builtin_wasm_tls_size: {
20237     llvm::Type *ResultType = ConvertType(E->getType());
20238     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_size, ResultType);
20239     return Builder.CreateCall(Callee);
20240   }
20241   case WebAssembly::BI__builtin_wasm_tls_align: {
20242     llvm::Type *ResultType = ConvertType(E->getType());
20243     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_align, ResultType);
20244     return Builder.CreateCall(Callee);
20245   }
20246   case WebAssembly::BI__builtin_wasm_tls_base: {
20247     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_base);
20248     return Builder.CreateCall(Callee);
20249   }
20250   case WebAssembly::BI__builtin_wasm_throw: {
20251     Value *Tag = EmitScalarExpr(E->getArg(0));
20252     Value *Obj = EmitScalarExpr(E->getArg(1));
20253     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
20254     return Builder.CreateCall(Callee, {Tag, Obj});
20255   }
20256   case WebAssembly::BI__builtin_wasm_rethrow: {
20257     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
20258     return Builder.CreateCall(Callee);
20259   }
20260   case WebAssembly::BI__builtin_wasm_memory_atomic_wait32: {
20261     Value *Addr = EmitScalarExpr(E->getArg(0));
20262     Value *Expected = EmitScalarExpr(E->getArg(1));
20263     Value *Timeout = EmitScalarExpr(E->getArg(2));
20264     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait32);
20265     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
20266   }
20267   case WebAssembly::BI__builtin_wasm_memory_atomic_wait64: {
20268     Value *Addr = EmitScalarExpr(E->getArg(0));
20269     Value *Expected = EmitScalarExpr(E->getArg(1));
20270     Value *Timeout = EmitScalarExpr(E->getArg(2));
20271     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait64);
20272     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
20273   }
20274   case WebAssembly::BI__builtin_wasm_memory_atomic_notify: {
20275     Value *Addr = EmitScalarExpr(E->getArg(0));
20276     Value *Count = EmitScalarExpr(E->getArg(1));
20277     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_notify);
20278     return Builder.CreateCall(Callee, {Addr, Count});
20279   }
20280   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f32:
20281   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f64:
20282   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f32:
20283   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f64: {
20284     Value *Src = EmitScalarExpr(E->getArg(0));
20285     llvm::Type *ResT = ConvertType(E->getType());
20286     Function *Callee =
20287         CGM.getIntrinsic(Intrinsic::wasm_trunc_signed, {ResT, Src->getType()});
20288     return Builder.CreateCall(Callee, {Src});
20289   }
20290   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f32:
20291   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f64:
20292   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f32:
20293   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f64: {
20294     Value *Src = EmitScalarExpr(E->getArg(0));
20295     llvm::Type *ResT = ConvertType(E->getType());
20296     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_unsigned,
20297                                         {ResT, Src->getType()});
20298     return Builder.CreateCall(Callee, {Src});
20299   }
20300   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f32:
20301   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
20302   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
20303   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
20304   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
20305     Value *Src = EmitScalarExpr(E->getArg(0));
20306     llvm::Type *ResT = ConvertType(E->getType());
20307     Function *Callee =
20308         CGM.getIntrinsic(Intrinsic::fptosi_sat, {ResT, Src->getType()});
20309     return Builder.CreateCall(Callee, {Src});
20310   }
20311   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32:
20312   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
20313   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
20314   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
20315   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
20316     Value *Src = EmitScalarExpr(E->getArg(0));
20317     llvm::Type *ResT = ConvertType(E->getType());
20318     Function *Callee =
20319         CGM.getIntrinsic(Intrinsic::fptoui_sat, {ResT, Src->getType()});
20320     return Builder.CreateCall(Callee, {Src});
20321   }
20322   case WebAssembly::BI__builtin_wasm_min_f32:
20323   case WebAssembly::BI__builtin_wasm_min_f64:
20324   case WebAssembly::BI__builtin_wasm_min_f32x4:
20325   case WebAssembly::BI__builtin_wasm_min_f64x2: {
20326     Value *LHS = EmitScalarExpr(E->getArg(0));
20327     Value *RHS = EmitScalarExpr(E->getArg(1));
20328     Function *Callee =
20329         CGM.getIntrinsic(Intrinsic::minimum, ConvertType(E->getType()));
20330     return Builder.CreateCall(Callee, {LHS, RHS});
20331   }
20332   case WebAssembly::BI__builtin_wasm_max_f32:
20333   case WebAssembly::BI__builtin_wasm_max_f64:
20334   case WebAssembly::BI__builtin_wasm_max_f32x4:
20335   case WebAssembly::BI__builtin_wasm_max_f64x2: {
20336     Value *LHS = EmitScalarExpr(E->getArg(0));
20337     Value *RHS = EmitScalarExpr(E->getArg(1));
20338     Function *Callee =
20339         CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
20340     return Builder.CreateCall(Callee, {LHS, RHS});
20341   }
20342   case WebAssembly::BI__builtin_wasm_pmin_f32x4:
20343   case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
20344     Value *LHS = EmitScalarExpr(E->getArg(0));
20345     Value *RHS = EmitScalarExpr(E->getArg(1));
20346     Function *Callee =
20347         CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
20348     return Builder.CreateCall(Callee, {LHS, RHS});
20349   }
20350   case WebAssembly::BI__builtin_wasm_pmax_f32x4:
20351   case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
20352     Value *LHS = EmitScalarExpr(E->getArg(0));
20353     Value *RHS = EmitScalarExpr(E->getArg(1));
20354     Function *Callee =
20355         CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
20356     return Builder.CreateCall(Callee, {LHS, RHS});
20357   }
20358   case WebAssembly::BI__builtin_wasm_ceil_f32x4:
20359   case WebAssembly::BI__builtin_wasm_floor_f32x4:
20360   case WebAssembly::BI__builtin_wasm_trunc_f32x4:
20361   case WebAssembly::BI__builtin_wasm_nearest_f32x4:
20362   case WebAssembly::BI__builtin_wasm_ceil_f64x2:
20363   case WebAssembly::BI__builtin_wasm_floor_f64x2:
20364   case WebAssembly::BI__builtin_wasm_trunc_f64x2:
20365   case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
20366     unsigned IntNo;
20367     switch (BuiltinID) {
20368     case WebAssembly::BI__builtin_wasm_ceil_f32x4:
20369     case WebAssembly::BI__builtin_wasm_ceil_f64x2:
20370       IntNo = Intrinsic::ceil;
20371       break;
20372     case WebAssembly::BI__builtin_wasm_floor_f32x4:
20373     case WebAssembly::BI__builtin_wasm_floor_f64x2:
20374       IntNo = Intrinsic::floor;
20375       break;
20376     case WebAssembly::BI__builtin_wasm_trunc_f32x4:
20377     case WebAssembly::BI__builtin_wasm_trunc_f64x2:
20378       IntNo = Intrinsic::trunc;
20379       break;
20380     case WebAssembly::BI__builtin_wasm_nearest_f32x4:
20381     case WebAssembly::BI__builtin_wasm_nearest_f64x2:
20382       IntNo = Intrinsic::nearbyint;
20383       break;
20384     default:
20385       llvm_unreachable("unexpected builtin ID");
20386     }
20387     Value *Value = EmitScalarExpr(E->getArg(0));
20388     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
20389     return Builder.CreateCall(Callee, Value);
20390   }
20391   case WebAssembly::BI__builtin_wasm_ref_null_extern: {
20392     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_extern);
20393     return Builder.CreateCall(Callee);
20394   }
20395   case WebAssembly::BI__builtin_wasm_ref_null_func: {
20396     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_func);
20397     return Builder.CreateCall(Callee);
20398   }
20399   case WebAssembly::BI__builtin_wasm_swizzle_i8x16: {
20400     Value *Src = EmitScalarExpr(E->getArg(0));
20401     Value *Indices = EmitScalarExpr(E->getArg(1));
20402     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_swizzle);
20403     return Builder.CreateCall(Callee, {Src, Indices});
20404   }
20405   case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
20406   case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
20407   case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
20408   case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
20409   case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
20410   case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
20411   case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
20412   case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: {
20413     unsigned IntNo;
20414     switch (BuiltinID) {
20415     case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
20416     case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
20417       IntNo = Intrinsic::sadd_sat;
20418       break;
20419     case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
20420     case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
20421       IntNo = Intrinsic::uadd_sat;
20422       break;
20423     case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
20424     case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
20425       IntNo = Intrinsic::wasm_sub_sat_signed;
20426       break;
20427     case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
20428     case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8:
20429       IntNo = Intrinsic::wasm_sub_sat_unsigned;
20430       break;
20431     default:
20432       llvm_unreachable("unexpected builtin ID");
20433     }
20434     Value *LHS = EmitScalarExpr(E->getArg(0));
20435     Value *RHS = EmitScalarExpr(E->getArg(1));
20436     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
20437     return Builder.CreateCall(Callee, {LHS, RHS});
20438   }
20439   case WebAssembly::BI__builtin_wasm_abs_i8x16:
20440   case WebAssembly::BI__builtin_wasm_abs_i16x8:
20441   case WebAssembly::BI__builtin_wasm_abs_i32x4:
20442   case WebAssembly::BI__builtin_wasm_abs_i64x2: {
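    // Emit integer vector abs as select(v < 0, -v, v).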
20443     Value *Vec = EmitScalarExpr(E->getArg(0));
20444     Value *Neg = Builder.CreateNeg(Vec, "neg");
20445     Constant *Zero = llvm::Constant::getNullValue(Vec->getType());
20446     Value *ICmp = Builder.CreateICmpSLT(Vec, Zero, "abscond");
20447     return Builder.CreateSelect(ICmp, Neg, Vec, "abs");
20448   }
20449   case WebAssembly::BI__builtin_wasm_min_s_i8x16:
20450   case WebAssembly::BI__builtin_wasm_min_u_i8x16:
20451   case WebAssembly::BI__builtin_wasm_max_s_i8x16:
20452   case WebAssembly::BI__builtin_wasm_max_u_i8x16:
20453   case WebAssembly::BI__builtin_wasm_min_s_i16x8:
20454   case WebAssembly::BI__builtin_wasm_min_u_i16x8:
20455   case WebAssembly::BI__builtin_wasm_max_s_i16x8:
20456   case WebAssembly::BI__builtin_wasm_max_u_i16x8:
20457   case WebAssembly::BI__builtin_wasm_min_s_i32x4:
20458   case WebAssembly::BI__builtin_wasm_min_u_i32x4:
20459   case WebAssembly::BI__builtin_wasm_max_s_i32x4:
20460   case WebAssembly::BI__builtin_wasm_max_u_i32x4: {
20461     Value *LHS = EmitScalarExpr(E->getArg(0));
20462     Value *RHS = EmitScalarExpr(E->getArg(1));
20463     Value *ICmp;
20464     switch (BuiltinID) {
20465     case WebAssembly::BI__builtin_wasm_min_s_i8x16:
20466     case WebAssembly::BI__builtin_wasm_min_s_i16x8:
20467     case WebAssembly::BI__builtin_wasm_min_s_i32x4:
20468       ICmp = Builder.CreateICmpSLT(LHS, RHS);
20469       break;
20470     case WebAssembly::BI__builtin_wasm_min_u_i8x16:
20471     case WebAssembly::BI__builtin_wasm_min_u_i16x8:
20472     case WebAssembly::BI__builtin_wasm_min_u_i32x4:
20473       ICmp = Builder.CreateICmpULT(LHS, RHS);
20474       break;
20475     case WebAssembly::BI__builtin_wasm_max_s_i8x16:
20476     case WebAssembly::BI__builtin_wasm_max_s_i16x8:
20477     case WebAssembly::BI__builtin_wasm_max_s_i32x4:
20478       ICmp = Builder.CreateICmpSGT(LHS, RHS);
20479       break;
20480     case WebAssembly::BI__builtin_wasm_max_u_i8x16:
20481     case WebAssembly::BI__builtin_wasm_max_u_i16x8:
20482     case WebAssembly::BI__builtin_wasm_max_u_i32x4:
20483       ICmp = Builder.CreateICmpUGT(LHS, RHS);
20484       break;
20485     default:
20486       llvm_unreachable("unexpected builtin ID");
20487     }
20488     return Builder.CreateSelect(ICmp, LHS, RHS);
20489   }
20490   case WebAssembly::BI__builtin_wasm_avgr_u_i8x16:
20491   case WebAssembly::BI__builtin_wasm_avgr_u_i16x8: {
20492     Value *LHS = EmitScalarExpr(E->getArg(0));
20493     Value *RHS = EmitScalarExpr(E->getArg(1));
20494     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_avgr_unsigned,
20495                                         ConvertType(E->getType()));
20496     return Builder.CreateCall(Callee, {LHS, RHS});
20497   }
20498   case WebAssembly::BI__builtin_wasm_q15mulr_sat_s_i16x8: {
20499     Value *LHS = EmitScalarExpr(E->getArg(0));
20500     Value *RHS = EmitScalarExpr(E->getArg(1));
20501     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_q15mulr_sat_signed);
20502     return Builder.CreateCall(Callee, {LHS, RHS});
20503   }
20504   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
20505   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
20506   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
20507   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4: {
20508     Value *Vec = EmitScalarExpr(E->getArg(0));
20509     unsigned IntNo;
20510     switch (BuiltinID) {
20511     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
20512     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
20513       IntNo = Intrinsic::wasm_extadd_pairwise_signed;
20514       break;
20515     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
20516     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4:
20517       IntNo = Intrinsic::wasm_extadd_pairwise_unsigned;
20518       break;
20519     default:
20520       llvm_unreachable("unexpected builtin ID");
20521     }
20522 
20523     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
20524     return Builder.CreateCall(Callee, Vec);
20525   }
20526   case WebAssembly::BI__builtin_wasm_bitselect: {
20527     Value *V1 = EmitScalarExpr(E->getArg(0));
20528     Value *V2 = EmitScalarExpr(E->getArg(1));
20529     Value *C = EmitScalarExpr(E->getArg(2));
20530     Function *Callee =
20531         CGM.getIntrinsic(Intrinsic::wasm_bitselect, ConvertType(E->getType()));
20532     return Builder.CreateCall(Callee, {V1, V2, C});
20533   }
20534   case WebAssembly::BI__builtin_wasm_dot_s_i32x4_i16x8: {
20535     Value *LHS = EmitScalarExpr(E->getArg(0));
20536     Value *RHS = EmitScalarExpr(E->getArg(1));
20537     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_dot);
20538     return Builder.CreateCall(Callee, {LHS, RHS});
20539   }
20540   case WebAssembly::BI__builtin_wasm_popcnt_i8x16: {
20541     Value *Vec = EmitScalarExpr(E->getArg(0));
20542     Function *Callee =
20543         CGM.getIntrinsic(Intrinsic::ctpop, ConvertType(E->getType()));
20544     return Builder.CreateCall(Callee, {Vec});
20545   }
20546   case WebAssembly::BI__builtin_wasm_any_true_v128:
20547   case WebAssembly::BI__builtin_wasm_all_true_i8x16:
20548   case WebAssembly::BI__builtin_wasm_all_true_i16x8:
20549   case WebAssembly::BI__builtin_wasm_all_true_i32x4:
20550   case WebAssembly::BI__builtin_wasm_all_true_i64x2: {
20551     unsigned IntNo;
20552     switch (BuiltinID) {
20553     case WebAssembly::BI__builtin_wasm_any_true_v128:
20554       IntNo = Intrinsic::wasm_anytrue;
20555       break;
20556     case WebAssembly::BI__builtin_wasm_all_true_i8x16:
20557     case WebAssembly::BI__builtin_wasm_all_true_i16x8:
20558     case WebAssembly::BI__builtin_wasm_all_true_i32x4:
20559     case WebAssembly::BI__builtin_wasm_all_true_i64x2:
20560       IntNo = Intrinsic::wasm_alltrue;
20561       break;
20562     default:
20563       llvm_unreachable("unexpected builtin ID");
20564     }
20565     Value *Vec = EmitScalarExpr(E->getArg(0));
20566     Function *Callee = CGM.getIntrinsic(IntNo, Vec->getType());
20567     return Builder.CreateCall(Callee, {Vec});
20568   }
20569   case WebAssembly::BI__builtin_wasm_bitmask_i8x16:
20570   case WebAssembly::BI__builtin_wasm_bitmask_i16x8:
20571   case WebAssembly::BI__builtin_wasm_bitmask_i32x4:
20572   case WebAssembly::BI__builtin_wasm_bitmask_i64x2: {
20573     Value *Vec = EmitScalarExpr(E->getArg(0));
20574     Function *Callee =
20575         CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
20576     return Builder.CreateCall(Callee, {Vec});
20577   }
20578   case WebAssembly::BI__builtin_wasm_abs_f32x4:
20579   case WebAssembly::BI__builtin_wasm_abs_f64x2: {
20580     Value *Vec = EmitScalarExpr(E->getArg(0));
20581     Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
20582     return Builder.CreateCall(Callee, {Vec});
20583   }
20584   case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
20585   case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
20586     Value *Vec = EmitScalarExpr(E->getArg(0));
20587     Function *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType());
20588     return Builder.CreateCall(Callee, {Vec});
20589   }
20590   case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
20591   case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
20592   case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
20593   case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {
20594     Value *Low = EmitScalarExpr(E->getArg(0));
20595     Value *High = EmitScalarExpr(E->getArg(1));
20596     unsigned IntNo;
20597     switch (BuiltinID) {
20598     case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
20599     case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
20600       IntNo = Intrinsic::wasm_narrow_signed;
20601       break;
20602     case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
20603     case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4:
20604       IntNo = Intrinsic::wasm_narrow_unsigned;
20605       break;
20606     default:
20607       llvm_unreachable("unexpected builtin ID");
20608     }
20609     Function *Callee =
20610         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
20611     return Builder.CreateCall(Callee, {Low, High});
20612   }
20613   case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
20614   case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4: {
20615     Value *Vec = EmitScalarExpr(E->getArg(0));
20616     unsigned IntNo;
20617     switch (BuiltinID) {
20618     case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
20619       IntNo = Intrinsic::fptosi_sat;
20620       break;
20621     case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4:
20622       IntNo = Intrinsic::fptoui_sat;
20623       break;
20624     default:
20625       llvm_unreachable("unexpected builtin ID");
20626     }
20627     llvm::Type *SrcT = Vec->getType();
20628     llvm::Type *TruncT = SrcT->getWithNewType(Builder.getInt32Ty());
20629     Function *Callee = CGM.getIntrinsic(IntNo, {TruncT, SrcT});
20630     Value *Trunc = Builder.CreateCall(Callee, Vec);
20631     Value *Splat = Constant::getNullValue(TruncT);
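    // Concatenate the truncated <2 x i32> result with two zero lanes to form
    // the final i32x4 value.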
20632     return Builder.CreateShuffleVector(Trunc, Splat, ArrayRef<int>{0, 1, 2, 3});
20633   }
20634   case WebAssembly::BI__builtin_wasm_shuffle_i8x16: {
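    // Operands are the two input vectors followed by the 16 lane indices,
    // each of which must be an integer constant expression.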
20635     Value *Ops[18];
20636     size_t OpIdx = 0;
20637     Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
20638     Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
20639     while (OpIdx < 18) {
20640       std::optional<llvm::APSInt> LaneConst =
20641           E->getArg(OpIdx)->getIntegerConstantExpr(getContext());
20642       assert(LaneConst && "Constant arg isn't actually constant?");
20643       Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), *LaneConst);
20644     }
20645     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
20646     return Builder.CreateCall(Callee, Ops);
20647   }
20648   case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
20649   case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
20650   case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
20651   case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2: {
20652     Value *A = EmitScalarExpr(E->getArg(0));
20653     Value *B = EmitScalarExpr(E->getArg(1));
20654     Value *C = EmitScalarExpr(E->getArg(2));
20655     unsigned IntNo;
20656     switch (BuiltinID) {
20657     case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
20658     case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
20659       IntNo = Intrinsic::wasm_relaxed_madd;
20660       break;
20661     case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
20662     case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2:
20663       IntNo = Intrinsic::wasm_relaxed_nmadd;
20664       break;
20665     default:
20666       llvm_unreachable("unexpected builtin ID");
20667     }
20668     Function *Callee = CGM.getIntrinsic(IntNo, A->getType());
20669     return Builder.CreateCall(Callee, {A, B, C});
20670   }
20671   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i8x16:
20672   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i16x8:
20673   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i32x4:
20674   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i64x2: {
20675     Value *A = EmitScalarExpr(E->getArg(0));
20676     Value *B = EmitScalarExpr(E->getArg(1));
20677     Value *C = EmitScalarExpr(E->getArg(2));
20678     Function *Callee =
20679         CGM.getIntrinsic(Intrinsic::wasm_relaxed_laneselect, A->getType());
20680     return Builder.CreateCall(Callee, {A, B, C});
20681   }
20682   case WebAssembly::BI__builtin_wasm_relaxed_swizzle_i8x16: {
20683     Value *Src = EmitScalarExpr(E->getArg(0));
20684     Value *Indices = EmitScalarExpr(E->getArg(1));
20685     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_swizzle);
20686     return Builder.CreateCall(Callee, {Src, Indices});
20687   }
20688   case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
20689   case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
20690   case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
20691   case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2: {
20692     Value *LHS = EmitScalarExpr(E->getArg(0));
20693     Value *RHS = EmitScalarExpr(E->getArg(1));
20694     unsigned IntNo;
20695     switch (BuiltinID) {
20696     case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
20697     case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
20698       IntNo = Intrinsic::wasm_relaxed_min;
20699       break;
20700     case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
20701     case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2:
20702       IntNo = Intrinsic::wasm_relaxed_max;
20703       break;
20704     default:
20705       llvm_unreachable("unexpected builtin ID");
20706     }
20707     Function *Callee = CGM.getIntrinsic(IntNo, LHS->getType());
20708     return Builder.CreateCall(Callee, {LHS, RHS});
20709   }
20710   case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
20711   case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
20712   case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
20713   case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2: {
20714     Value *Vec = EmitScalarExpr(E->getArg(0));
20715     unsigned IntNo;
20716     switch (BuiltinID) {
20717     case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
20718       IntNo = Intrinsic::wasm_relaxed_trunc_signed;
20719       break;
20720     case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
20721       IntNo = Intrinsic::wasm_relaxed_trunc_unsigned;
20722       break;
20723     case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
20724       IntNo = Intrinsic::wasm_relaxed_trunc_signed_zero;
20725       break;
20726     case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2:
20727       IntNo = Intrinsic::wasm_relaxed_trunc_unsigned_zero;
20728       break;
20729     default:
20730       llvm_unreachable("unexpected builtin ID");
20731     }
20732     Function *Callee = CGM.getIntrinsic(IntNo);
20733     return Builder.CreateCall(Callee, {Vec});
20734   }
20735   case WebAssembly::BI__builtin_wasm_relaxed_q15mulr_s_i16x8: {
20736     Value *LHS = EmitScalarExpr(E->getArg(0));
20737     Value *RHS = EmitScalarExpr(E->getArg(1));
20738     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_q15mulr_signed);
20739     return Builder.CreateCall(Callee, {LHS, RHS});
20740   }
20741   case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_s_i16x8: {
20742     Value *LHS = EmitScalarExpr(E->getArg(0));
20743     Value *RHS = EmitScalarExpr(E->getArg(1));
20744     Function *Callee =
20745         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_signed);
20746     return Builder.CreateCall(Callee, {LHS, RHS});
20747   }
20748   case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4: {
20749     Value *LHS = EmitScalarExpr(E->getArg(0));
20750     Value *RHS = EmitScalarExpr(E->getArg(1));
20751     Value *Acc = EmitScalarExpr(E->getArg(2));
20752     Function *Callee =
20753         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_add_signed);
20754     return Builder.CreateCall(Callee, {LHS, RHS, Acc});
20755   }
20756   case WebAssembly::BI__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4: {
20757     Value *LHS = EmitScalarExpr(E->getArg(0));
20758     Value *RHS = EmitScalarExpr(E->getArg(1));
20759     Value *Acc = EmitScalarExpr(E->getArg(2));
20760     Function *Callee =
20761         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_bf16x8_add_f32);
20762     return Builder.CreateCall(Callee, {LHS, RHS, Acc});
20763   }
20764   case WebAssembly::BI__builtin_wasm_table_get: {
20765     assert(E->getArg(0)->getType()->isArrayType());
20766     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20767     Value *Index = EmitScalarExpr(E->getArg(1));
20768     Function *Callee;
20769     if (E->getType().isWebAssemblyExternrefType())
20770       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_externref);
20771     else if (E->getType().isWebAssemblyFuncrefType())
20772       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_funcref);
20773     else
20774       llvm_unreachable(
20775           "Unexpected reference type for __builtin_wasm_table_get");
20776     return Builder.CreateCall(Callee, {Table, Index});
20777   }
20778   case WebAssembly::BI__builtin_wasm_table_set: {
20779     assert(E->getArg(0)->getType()->isArrayType());
20780     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20781     Value *Index = EmitScalarExpr(E->getArg(1));
20782     Value *Val = EmitScalarExpr(E->getArg(2));
20783     Function *Callee;
20784     if (E->getArg(2)->getType().isWebAssemblyExternrefType())
20785       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_externref);
20786     else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
20787       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_funcref);
20788     else
20789       llvm_unreachable(
20790           "Unexpected reference type for __builtin_wasm_table_set");
20791     return Builder.CreateCall(Callee, {Table, Index, Val});
20792   }
20793   case WebAssembly::BI__builtin_wasm_table_size: {
20794     assert(E->getArg(0)->getType()->isArrayType());
20795     Value *Value = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20796     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_size);
20797     return Builder.CreateCall(Callee, Value);
20798   }
20799   case WebAssembly::BI__builtin_wasm_table_grow: {
20800     assert(E->getArg(0)->getType()->isArrayType());
20801     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20802     Value *Val = EmitScalarExpr(E->getArg(1));
20803     Value *NElems = EmitScalarExpr(E->getArg(2));
20804 
20805     Function *Callee;
20806     if (E->getArg(1)->getType().isWebAssemblyExternrefType())
20807       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_grow_externref);
20808     else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
20809       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
20810     else
20811       llvm_unreachable(
20812           "Unexpected reference type for __builtin_wasm_table_grow");
20813 
20814     return Builder.CreateCall(Callee, {Table, Val, NElems});
20815   }
20816   case WebAssembly::BI__builtin_wasm_table_fill: {
20817     assert(E->getArg(0)->getType()->isArrayType());
20818     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20819     Value *Index = EmitScalarExpr(E->getArg(1));
20820     Value *Val = EmitScalarExpr(E->getArg(2));
20821     Value *NElems = EmitScalarExpr(E->getArg(3));
20822 
20823     Function *Callee;
20824     if (E->getArg(2)->getType().isWebAssemblyExternrefType())
20825       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_externref);
20826     else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
20827       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
20828     else
20829       llvm_unreachable(
20830           "Unexpected reference type for __builtin_wasm_table_fill");
20831 
20832     return Builder.CreateCall(Callee, {Table, Index, Val, NElems});
20833   }
20834   case WebAssembly::BI__builtin_wasm_table_copy: {
20835     assert(E->getArg(0)->getType()->isArrayType());
20836     Value *TableX = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20837     Value *TableY = EmitArrayToPointerDecay(E->getArg(1)).getPointer();
20838     Value *DstIdx = EmitScalarExpr(E->getArg(2));
20839     Value *SrcIdx = EmitScalarExpr(E->getArg(3));
20840     Value *NElems = EmitScalarExpr(E->getArg(4));
20841 
20842     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_copy);
20843 
20844     return Builder.CreateCall(Callee, {TableX, TableY, SrcIdx, DstIdx, NElems});
20845   }
20846   default:
20847     return nullptr;
20848   }
20849 }
20850 
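/// Map a Hexagon builtin that needs custom handling to its LLVM intrinsic and
/// the HVX vector length (in bytes) it operates on, using a lazily sorted
/// lookup table. Returns {Intrinsic::not_intrinsic, 0} if the builtin has no
/// custom mapping.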
20851 static std::pair<Intrinsic::ID, unsigned>
20852 getIntrinsicForHexagonNonClangBuiltin(unsigned BuiltinID) {
20853   struct Info {
20854     unsigned BuiltinID;
20855     Intrinsic::ID IntrinsicID;
20856     unsigned VecLen;
20857   };
20858   static Info Infos[] = {
20859 #define CUSTOM_BUILTIN_MAPPING(x,s) \
20860   { Hexagon::BI__builtin_HEXAGON_##x, Intrinsic::hexagon_##x, s },
20861     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pci, 0)
20862     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pci, 0)
20863     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pci, 0)
20864     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pci, 0)
20865     CUSTOM_BUILTIN_MAPPING(L2_loadri_pci, 0)
20866     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pci, 0)
20867     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pcr, 0)
20868     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pcr, 0)
20869     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pcr, 0)
20870     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pcr, 0)
20871     CUSTOM_BUILTIN_MAPPING(L2_loadri_pcr, 0)
20872     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pcr, 0)
20873     CUSTOM_BUILTIN_MAPPING(S2_storerb_pci, 0)
20874     CUSTOM_BUILTIN_MAPPING(S2_storerh_pci, 0)
20875     CUSTOM_BUILTIN_MAPPING(S2_storerf_pci, 0)
20876     CUSTOM_BUILTIN_MAPPING(S2_storeri_pci, 0)
20877     CUSTOM_BUILTIN_MAPPING(S2_storerd_pci, 0)
20878     CUSTOM_BUILTIN_MAPPING(S2_storerb_pcr, 0)
20879     CUSTOM_BUILTIN_MAPPING(S2_storerh_pcr, 0)
20880     CUSTOM_BUILTIN_MAPPING(S2_storerf_pcr, 0)
20881     CUSTOM_BUILTIN_MAPPING(S2_storeri_pcr, 0)
20882     CUSTOM_BUILTIN_MAPPING(S2_storerd_pcr, 0)
20883     // Legacy builtins that take a vector in place of a vector predicate.
20884     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq, 64)
20885     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq, 64)
20886     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq, 64)
20887     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq, 64)
20888     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq_128B, 128)
20889     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq_128B, 128)
20890     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq_128B, 128)
20891     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq_128B, 128)
20892 #include "clang/Basic/BuiltinsHexagonMapCustomDep.def"
20893 #undef CUSTOM_BUILTIN_MAPPING
20894   };
20895 
20896   auto CmpInfo = [] (Info A, Info B) { return A.BuiltinID < B.BuiltinID; };
20897   static const bool SortOnce = (llvm::sort(Infos, CmpInfo), true);
20898   (void)SortOnce;
20899 
20900   const Info *F = llvm::lower_bound(Infos, Info{BuiltinID, 0, 0}, CmpInfo);
20901   if (F == std::end(Infos) || F->BuiltinID != BuiltinID)
20902     return {Intrinsic::not_intrinsic, 0};
20903 
20904   return {F->IntrinsicID, F->VecLen};
20905 }
20906 
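/// Emit LLVM IR for a Hexagon builtin call. Only builtins that need custom
/// lowering (circular and bit-reverse loads/stores, carry and masked-store
/// operations) are expanded here; nullptr is returned for everything else.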
20907 Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
20908                                                const CallExpr *E) {
20909   Intrinsic::ID ID;
20910   unsigned VecLen;
20911   std::tie(ID, VecLen) = getIntrinsicForHexagonNonClangBuiltin(BuiltinID);
20912 
20913   auto MakeCircOp = [this, E](unsigned IntID, bool IsLoad) {
20914     // The base pointer is passed by address, so it needs to be loaded.
20915     Address A = EmitPointerWithAlignment(E->getArg(0));
20916     Address BP = Address(A.getPointer(), Int8PtrTy, A.getAlignment());
20917     llvm::Value *Base = Builder.CreateLoad(BP);
20918     // The treatment of both loads and stores is the same: the arguments for
20919     // the builtin are the same as the arguments for the intrinsic.
20920     // Load:
20921     //   builtin(Base, Inc, Mod, Start) -> intr(Base, Inc, Mod, Start)
20922     //   builtin(Base, Mod, Start)      -> intr(Base, Mod, Start)
20923     // Store:
20924     //   builtin(Base, Inc, Mod, Val, Start) -> intr(Base, Inc, Mod, Val, Start)
20925     //   builtin(Base, Mod, Val, Start)      -> intr(Base, Mod, Val, Start)
20926     SmallVector<llvm::Value*,5> Ops = { Base };
20927     for (unsigned i = 1, e = E->getNumArgs(); i != e; ++i)
20928       Ops.push_back(EmitScalarExpr(E->getArg(i)));
20929 
20930     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
20931     // The load intrinsics generate two results (Value, NewBase), stores
20932     // generate one (NewBase). The new base address needs to be stored.
20933     llvm::Value *NewBase = IsLoad ? Builder.CreateExtractValue(Result, 1)
20934                                   : Result;
20935     llvm::Value *LV = EmitScalarExpr(E->getArg(0));
20936     Address Dest = EmitPointerWithAlignment(E->getArg(0));
20937     llvm::Value *RetVal =
20938         Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
20939     if (IsLoad)
20940       RetVal = Builder.CreateExtractValue(Result, 0);
20941     return RetVal;
20942   };
20943 
  // Handle the conversion of bit-reverse load intrinsics to bitcode.
  // The intrinsic call emitted below only reads from memory; the write to
  // memory is handled by the store instruction that follows it.
20947   auto MakeBrevLd = [this, E](unsigned IntID, llvm::Type *DestTy) {
    // The intrinsic generates one result, the new value of the base pointer,
    // which needs to be returned. The destination of the loaded value is
    // passed to the builtin by address, so the value has to be stored
    // explicitly.
20951     llvm::Value *BaseAddress = EmitScalarExpr(E->getArg(0));
20952 
    // Expressions like &(*pt++) are incremented on every evaluation;
    // EmitPointerWithAlignment and EmitScalarExpr each evaluate the
    // expression once per call, so the destination is only evaluated once.
20956     Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
20957     DestAddr = Address(DestAddr.getPointer(), Int8Ty, DestAddr.getAlignment());
20958     llvm::Value *DestAddress = DestAddr.getPointer();
20959 
20960     // Operands are Base, Dest, Modifier.
20961     // The intrinsic format in LLVM IR is defined as
20962     // { ValueType, i8* } (i8*, i32).
20963     llvm::Value *Result = Builder.CreateCall(
20964         CGM.getIntrinsic(IntID), {BaseAddress, EmitScalarExpr(E->getArg(2))});
20965 
20966     // The value needs to be stored as the variable is passed by reference.
20967     llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
20968 
    // The stored value needs to be truncated to fit the destination type.
    // While i32 and i64 are natively supported on Hexagon, i8 and i16 need
    // to be handled with stores of the respective destination type.
20972     DestVal = Builder.CreateTrunc(DestVal, DestTy);
20973 
20974     Builder.CreateAlignedStore(DestVal, DestAddress, DestAddr.getAlignment());
20975     // The updated value of the base pointer is returned.
20976     return Builder.CreateExtractValue(Result, 1);
20977   };
20978 
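  // Convert between HVX vectors and vector predicates: V2Q masks a vector
  // into a predicate with vandvrt, and Q2V expands a predicate back into a
  // vector with vandqrt, both using an all-ones scalar mask. The _128B
  // variants are selected for 128-byte vectors.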
20979   auto V2Q = [this, VecLen] (llvm::Value *Vec) {
20980     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandvrt_128B
20981                                      : Intrinsic::hexagon_V6_vandvrt;
20982     return Builder.CreateCall(CGM.getIntrinsic(ID),
20983                               {Vec, Builder.getInt32(-1)});
20984   };
20985   auto Q2V = [this, VecLen] (llvm::Value *Pred) {
20986     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandqrt_128B
20987                                      : Intrinsic::hexagon_V6_vandqrt;
20988     return Builder.CreateCall(CGM.getIntrinsic(ID),
20989                               {Pred, Builder.getInt32(-1)});
20990   };
20991 
20992   switch (BuiltinID) {
20993   // These intrinsics return a tuple {Vector, VectorPred} in LLVM IR,
20994   // and the corresponding C/C++ builtins use loads/stores to update
20995   // the predicate.
20996   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
20997   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B:
20998   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
20999   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
21000     // Get the type from the 0-th argument.
21001     llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
21002     Address PredAddr =
21003         EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
21004     llvm::Value *PredIn = V2Q(Builder.CreateLoad(PredAddr));
21005     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
21006         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), PredIn});
21007 
21008     llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
21009     Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
21010         PredAddr.getAlignment());
21011     return Builder.CreateExtractValue(Result, 0);
21012   }
  // These are identical to the builtins above, except they don't consume an
  // input carry; they only generate a carry-out. Since they still produce two
  // outputs, generate the store of the predicate, but no load.
21016   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo:
21017   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo_128B:
21018   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo:
21019   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo_128B: {
21020     // Get the type from the 0-th argument.
21021     llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
21022     Address PredAddr =
21023         EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
21024     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
21025         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
21026 
21027     llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
21028     Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
21029         PredAddr.getAlignment());
21030     return Builder.CreateExtractValue(Result, 0);
21031   }
21032 
21033   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq:
21034   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq:
21035   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq:
21036   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq:
21037   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq_128B:
21038   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq_128B:
21039   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq_128B:
21040   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq_128B: {
21041     SmallVector<llvm::Value*,4> Ops;
21042     const Expr *PredOp = E->getArg(0);
21043     // There will be an implicit cast to a boolean vector. Strip it.
21044     if (auto *Cast = dyn_cast<ImplicitCastExpr>(PredOp)) {
21045       if (Cast->getCastKind() == CK_BitCast)
21046         PredOp = Cast->getSubExpr();
21047       Ops.push_back(V2Q(EmitScalarExpr(PredOp)));
21048     }
21049     for (int i = 1, e = E->getNumArgs(); i != e; ++i)
21050       Ops.push_back(EmitScalarExpr(E->getArg(i)));
21051     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
21052   }
21053 
21054   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
21055   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
21056   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
21057   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
21058   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
21059   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
21060   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
21061   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
21062   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
21063   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
21064   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
21065   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
21066     return MakeCircOp(ID, /*IsLoad=*/true);
21067   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
21068   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
21069   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
21070   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
21071   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
21072   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
21073   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
21074   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
21075   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
21076   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
21077     return MakeCircOp(ID, /*IsLoad=*/false);
21078   case Hexagon::BI__builtin_brev_ldub:
21079     return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
21080   case Hexagon::BI__builtin_brev_ldb:
21081     return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
21082   case Hexagon::BI__builtin_brev_lduh:
21083     return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
21084   case Hexagon::BI__builtin_brev_ldh:
21085     return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
21086   case Hexagon::BI__builtin_brev_ldw:
21087     return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
21088   case Hexagon::BI__builtin_brev_ldd:
21089     return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
21090   } // switch
21091 
21092   return nullptr;
21093 }
21094 
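/// Emit LLVM IR for a RISC-V builtin call. Scalar bit-manipulation, crypto,
/// and non-temporal load/store builtins are handled explicitly below; RVV
/// vector builtins are dispatched through the generated code included at the
/// end of the switch.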
21095 Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
21096                                              const CallExpr *E,
21097                                              ReturnValueSlot ReturnValue) {
21098   SmallVector<Value *, 4> Ops;
21099   llvm::Type *ResultType = ConvertType(E->getType());
21100 
21101   // Find out if any arguments are required to be integer constant expressions.
21102   unsigned ICEArguments = 0;
21103   ASTContext::GetBuiltinTypeError Error;
21104   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
21105   if (Error == ASTContext::GE_Missing_type) {
21106     // Vector intrinsics don't have a type string.
21107     assert(BuiltinID >= clang::RISCV::FirstRVVBuiltin &&
21108            BuiltinID <= clang::RISCV::LastRVVBuiltin);
21109     ICEArguments = 0;
21110     if (BuiltinID == RISCVVector::BI__builtin_rvv_vget_v ||
21111         BuiltinID == RISCVVector::BI__builtin_rvv_vset_v)
21112       ICEArguments = 1 << 1;
21113   } else {
21114     assert(Error == ASTContext::GE_None && "Unexpected error");
21115   }
21116 
21117   if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
21118     ICEArguments |= (1 << 1);
21119   if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
21120     ICEArguments |= (1 << 2);
21121 
21122   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
    // Handle aggregate arguments, namely RVV tuple types in segment
    // loads/stores.
21124     if (hasAggregateEvaluationKind(E->getArg(i)->getType())) {
21125       LValue L = EmitAggExprToLValue(E->getArg(i));
21126       llvm::Value *AggValue = Builder.CreateLoad(L.getAddress(*this));
21127       Ops.push_back(AggValue);
21128       continue;
21129     }
21130     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
21131   }
21132 
21133   Intrinsic::ID ID = Intrinsic::not_intrinsic;
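  // NF is the number of fields (segments) for RVV segment load/store
  // builtins; it defaults to 1 and is set by the generated vector builtin
  // code included below.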
21134   unsigned NF = 1;
21135   // The 0th bit simulates the `vta` of RVV
21136   // The 1st bit simulates the `vma` of RVV
21137   constexpr unsigned RVV_VTA = 0x1;
21138   constexpr unsigned RVV_VMA = 0x2;
21139   int PolicyAttrs = 0;
21140   bool IsMasked = false;
21141 
21142   // Required for overloaded intrinsics.
21143   llvm::SmallVector<llvm::Type *, 2> IntrinsicTypes;
21144   switch (BuiltinID) {
21145   default: llvm_unreachable("unexpected builtin ID");
21146   case RISCV::BI__builtin_riscv_orc_b_32:
21147   case RISCV::BI__builtin_riscv_orc_b_64:
21148   case RISCV::BI__builtin_riscv_clz_32:
21149   case RISCV::BI__builtin_riscv_clz_64:
21150   case RISCV::BI__builtin_riscv_ctz_32:
21151   case RISCV::BI__builtin_riscv_ctz_64:
21152   case RISCV::BI__builtin_riscv_clmul_32:
21153   case RISCV::BI__builtin_riscv_clmul_64:
21154   case RISCV::BI__builtin_riscv_clmulh_32:
21155   case RISCV::BI__builtin_riscv_clmulh_64:
21156   case RISCV::BI__builtin_riscv_clmulr_32:
21157   case RISCV::BI__builtin_riscv_clmulr_64:
21158   case RISCV::BI__builtin_riscv_xperm4_32:
21159   case RISCV::BI__builtin_riscv_xperm4_64:
21160   case RISCV::BI__builtin_riscv_xperm8_32:
21161   case RISCV::BI__builtin_riscv_xperm8_64:
21162   case RISCV::BI__builtin_riscv_brev8_32:
21163   case RISCV::BI__builtin_riscv_brev8_64:
21164   case RISCV::BI__builtin_riscv_zip_32:
21165   case RISCV::BI__builtin_riscv_unzip_32: {
21166     switch (BuiltinID) {
21167     default: llvm_unreachable("unexpected builtin ID");
21168     // Zbb
21169     case RISCV::BI__builtin_riscv_orc_b_32:
21170     case RISCV::BI__builtin_riscv_orc_b_64:
21171       ID = Intrinsic::riscv_orc_b;
21172       break;
21173     case RISCV::BI__builtin_riscv_clz_32:
21174     case RISCV::BI__builtin_riscv_clz_64: {
21175       Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
21176       Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
21177       if (Result->getType() != ResultType)
21178         Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
21179                                        "cast");
21180       return Result;
21181     }
21182     case RISCV::BI__builtin_riscv_ctz_32:
21183     case RISCV::BI__builtin_riscv_ctz_64: {
21184       Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
21185       Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
21186       if (Result->getType() != ResultType)
21187         Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
21188                                        "cast");
21189       return Result;
21190     }
21191 
21192     // Zbc
21193     case RISCV::BI__builtin_riscv_clmul_32:
21194     case RISCV::BI__builtin_riscv_clmul_64:
21195       ID = Intrinsic::riscv_clmul;
21196       break;
21197     case RISCV::BI__builtin_riscv_clmulh_32:
21198     case RISCV::BI__builtin_riscv_clmulh_64:
21199       ID = Intrinsic::riscv_clmulh;
21200       break;
21201     case RISCV::BI__builtin_riscv_clmulr_32:
21202     case RISCV::BI__builtin_riscv_clmulr_64:
21203       ID = Intrinsic::riscv_clmulr;
21204       break;
21205 
21206     // Zbkx
21207     case RISCV::BI__builtin_riscv_xperm8_32:
21208     case RISCV::BI__builtin_riscv_xperm8_64:
21209       ID = Intrinsic::riscv_xperm8;
21210       break;
21211     case RISCV::BI__builtin_riscv_xperm4_32:
21212     case RISCV::BI__builtin_riscv_xperm4_64:
21213       ID = Intrinsic::riscv_xperm4;
21214       break;
21215 
21216     // Zbkb
21217     case RISCV::BI__builtin_riscv_brev8_32:
21218     case RISCV::BI__builtin_riscv_brev8_64:
21219       ID = Intrinsic::riscv_brev8;
21220       break;
21221     case RISCV::BI__builtin_riscv_zip_32:
21222       ID = Intrinsic::riscv_zip;
21223       break;
21224     case RISCV::BI__builtin_riscv_unzip_32:
21225       ID = Intrinsic::riscv_unzip;
21226       break;
21227     }
21228 
21229     IntrinsicTypes = {ResultType};
21230     break;
21231   }
21232 
21233   // Zk builtins
21234 
21235   // Zknh
21236   case RISCV::BI__builtin_riscv_sha256sig0:
21237     ID = Intrinsic::riscv_sha256sig0;
21238     break;
21239   case RISCV::BI__builtin_riscv_sha256sig1:
21240     ID = Intrinsic::riscv_sha256sig1;
21241     break;
21242   case RISCV::BI__builtin_riscv_sha256sum0:
21243     ID = Intrinsic::riscv_sha256sum0;
21244     break;
21245   case RISCV::BI__builtin_riscv_sha256sum1:
21246     ID = Intrinsic::riscv_sha256sum1;
21247     break;
21248 
21249   // Zksed
21250   case RISCV::BI__builtin_riscv_sm4ks:
21251     ID = Intrinsic::riscv_sm4ks;
21252     break;
21253   case RISCV::BI__builtin_riscv_sm4ed:
21254     ID = Intrinsic::riscv_sm4ed;
21255     break;
21256 
21257   // Zksh
21258   case RISCV::BI__builtin_riscv_sm3p0:
21259     ID = Intrinsic::riscv_sm3p0;
21260     break;
21261   case RISCV::BI__builtin_riscv_sm3p1:
21262     ID = Intrinsic::riscv_sm3p1;
21263     break;
21264 
21265   // Zihintntl
21266   case RISCV::BI__builtin_riscv_ntl_load: {
21267     llvm::Type *ResTy = ConvertType(E->getType());
21268     unsigned DomainVal = 5; // Default __RISCV_NTLH_ALL
21269     if (Ops.size() == 2)
21270       DomainVal = cast<ConstantInt>(Ops[1])->getZExtValue();
21271 
21272     llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
21273         getLLVMContext(),
21274         llvm::ConstantAsMetadata::get(Builder.getInt32(DomainVal)));
21275     llvm::MDNode *NontemporalNode = llvm::MDNode::get(
21276         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
21277 
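    // Determine the width of the loaded value in bits so a natural alignment
    // can be derived for the non-temporal load below.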
21278     int Width;
    if (ResTy->isScalableTy()) {
21280       const ScalableVectorType *SVTy = cast<ScalableVectorType>(ResTy);
21281       llvm::Type *ScalarTy = ResTy->getScalarType();
21282       Width = ScalarTy->getPrimitiveSizeInBits() *
21283               SVTy->getElementCount().getKnownMinValue();
21284     } else
21285       Width = ResTy->getPrimitiveSizeInBits();
21286     LoadInst *Load = Builder.CreateLoad(
21287         Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
21288 
21289     Load->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
21290     Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
21291                       RISCVDomainNode);
21292 
21293     return Load;
21294   }
21295   case RISCV::BI__builtin_riscv_ntl_store: {
21296     unsigned DomainVal = 5; // Default __RISCV_NTLH_ALL
21297     if (Ops.size() == 3)
21298       DomainVal = cast<ConstantInt>(Ops[2])->getZExtValue();
21299 
21300     llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
21301         getLLVMContext(),
21302         llvm::ConstantAsMetadata::get(Builder.getInt32(DomainVal)));
21303     llvm::MDNode *NontemporalNode = llvm::MDNode::get(
21304         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
21305 
21306     StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
21307     Store->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
21308     Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
21309                        RISCVDomainNode);
21310 
21311     return Store;
21312   }
21313 
21314   // Vector builtins are handled from here.
21315 #include "clang/Basic/riscv_vector_builtin_cg.inc"
21316   // SiFive Vector builtins are handled from here.
21317 #include "clang/Basic/riscv_sifive_vector_builtin_cg.inc"
21318   }
21319 
21320   assert(ID != Intrinsic::not_intrinsic);
21321 
21322   llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes);
21323   return Builder.CreateCall(F, Ops, "");
21324 }
21325