1 //===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This contains code to emit Builtin calls as LLVM code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ABIInfo.h"
14 #include "CGCUDARuntime.h"
15 #include "CGCXXABI.h"
16 #include "CGObjCRuntime.h"
17 #include "CGOpenCLRuntime.h"
18 #include "CGRecordLayout.h"
19 #include "CodeGenFunction.h"
20 #include "CodeGenModule.h"
21 #include "ConstantEmitter.h"
22 #include "PatternInit.h"
23 #include "TargetInfo.h"
24 #include "clang/AST/ASTContext.h"
25 #include "clang/AST/Attr.h"
26 #include "clang/AST/Decl.h"
27 #include "clang/AST/OSLog.h"
28 #include "clang/Basic/TargetBuiltins.h"
29 #include "clang/Basic/TargetInfo.h"
30 #include "clang/CodeGen/CGFunctionInfo.h"
31 #include "clang/Frontend/FrontendDiagnostic.h"
32 #include "llvm/ADT/APFloat.h"
33 #include "llvm/ADT/APInt.h"
34 #include "llvm/ADT/FloatingPointMode.h"
35 #include "llvm/ADT/SmallPtrSet.h"
36 #include "llvm/ADT/StringExtras.h"
37 #include "llvm/Analysis/ValueTracking.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/InlineAsm.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/IntrinsicsAArch64.h"
42 #include "llvm/IR/IntrinsicsAMDGPU.h"
43 #include "llvm/IR/IntrinsicsARM.h"
44 #include "llvm/IR/IntrinsicsBPF.h"
45 #include "llvm/IR/IntrinsicsHexagon.h"
46 #include "llvm/IR/IntrinsicsLoongArch.h"
47 #include "llvm/IR/IntrinsicsNVPTX.h"
48 #include "llvm/IR/IntrinsicsPowerPC.h"
49 #include "llvm/IR/IntrinsicsR600.h"
50 #include "llvm/IR/IntrinsicsRISCV.h"
51 #include "llvm/IR/IntrinsicsS390.h"
52 #include "llvm/IR/IntrinsicsVE.h"
53 #include "llvm/IR/IntrinsicsWebAssembly.h"
54 #include "llvm/IR/IntrinsicsX86.h"
55 #include "llvm/IR/MDBuilder.h"
56 #include "llvm/IR/MatrixBuilder.h"
57 #include "llvm/Support/ConvertUTF.h"
58 #include "llvm/Support/ScopedPrinter.h"
59 #include "llvm/TargetParser/AArch64TargetParser.h"
60 #include "llvm/TargetParser/X86TargetParser.h"
61 #include <optional>
62 #include <sstream>
63 
64 using namespace clang;
65 using namespace CodeGen;
66 using namespace llvm;
67 
68 static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size,
69                              Align AlignmentInBytes) {
70   ConstantInt *Byte;
71   switch (CGF.getLangOpts().getTrivialAutoVarInit()) {
72   case LangOptions::TrivialAutoVarInitKind::Uninitialized:
73     // Nothing to initialize.
74     return;
75   case LangOptions::TrivialAutoVarInitKind::Zero:
76     Byte = CGF.Builder.getInt8(0x00);
77     break;
78   case LangOptions::TrivialAutoVarInitKind::Pattern: {
79     llvm::Type *Int8 = llvm::IntegerType::getInt8Ty(CGF.CGM.getLLVMContext());
80     Byte = llvm::dyn_cast<llvm::ConstantInt>(
81         initializationPatternFor(CGF.CGM, Int8));
82     break;
83   }
84   }
85   if (CGF.CGM.stopAutoInit())
86     return;
87   auto *I = CGF.Builder.CreateMemSet(AI, Byte, Size, AlignmentInBytes);
88   I->addAnnotationMetadata("auto-init");
89 }
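
// Illustrative sketch (comment only, not part of the emitted code): with
// -ftrivial-auto-var-init=zero, a builtin alloca such as
//   void *p = __builtin_alloca(n);
// gets an annotated memset roughly equivalent to
//   %p = alloca i8, i64 %n
//   call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 %n, i1 false)
// where the memset carries "auto-init" annotation metadata; pattern mode uses
// the byte returned by initializationPatternFor instead of zero.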
90 
91 /// getBuiltinLibFunction - Given a builtin id for a function like
92 /// "__builtin_fabsf", return a Function* for "fabsf".
93 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
94                                                      unsigned BuiltinID) {
95   assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
96 
97   // Get the name, skip over the __builtin_ prefix (if necessary).
98   StringRef Name;
99   GlobalDecl D(FD);
100 
101   // TODO: This list should be expanded or refactored after all GCC-compatible
102   // std libcall builtins are implemented.
103   static SmallDenseMap<unsigned, StringRef, 64> F128Builtins{
104       {Builtin::BI__builtin___fprintf_chk, "__fprintf_chkieee128"},
105       {Builtin::BI__builtin___printf_chk, "__printf_chkieee128"},
106       {Builtin::BI__builtin___snprintf_chk, "__snprintf_chkieee128"},
107       {Builtin::BI__builtin___sprintf_chk, "__sprintf_chkieee128"},
108       {Builtin::BI__builtin___vfprintf_chk, "__vfprintf_chkieee128"},
109       {Builtin::BI__builtin___vprintf_chk, "__vprintf_chkieee128"},
110       {Builtin::BI__builtin___vsnprintf_chk, "__vsnprintf_chkieee128"},
111       {Builtin::BI__builtin___vsprintf_chk, "__vsprintf_chkieee128"},
112       {Builtin::BI__builtin_fprintf, "__fprintfieee128"},
113       {Builtin::BI__builtin_printf, "__printfieee128"},
114       {Builtin::BI__builtin_snprintf, "__snprintfieee128"},
115       {Builtin::BI__builtin_sprintf, "__sprintfieee128"},
116       {Builtin::BI__builtin_vfprintf, "__vfprintfieee128"},
117       {Builtin::BI__builtin_vprintf, "__vprintfieee128"},
118       {Builtin::BI__builtin_vsnprintf, "__vsnprintfieee128"},
119       {Builtin::BI__builtin_vsprintf, "__vsprintfieee128"},
120       {Builtin::BI__builtin_fscanf, "__fscanfieee128"},
121       {Builtin::BI__builtin_scanf, "__scanfieee128"},
122       {Builtin::BI__builtin_sscanf, "__sscanfieee128"},
123       {Builtin::BI__builtin_vfscanf, "__vfscanfieee128"},
124       {Builtin::BI__builtin_vscanf, "__vscanfieee128"},
125       {Builtin::BI__builtin_vsscanf, "__vsscanfieee128"},
126       {Builtin::BI__builtin_nexttowardf128, "__nexttowardieee128"},
127   };
128 
129   // The AIX library functions frexpl, ldexpl, and modfl operate on 128-bit
130   // IBM 'long double' (i.e. __ibm128). Map them to the 'double' versions
131   // when 'long double' is the 64-bit format.
132   static SmallDenseMap<unsigned, StringRef, 4> AIXLongDouble64Builtins{
133       {Builtin::BI__builtin_frexpl, "frexp"},
134       {Builtin::BI__builtin_ldexpl, "ldexp"},
135       {Builtin::BI__builtin_modfl, "modf"},
136   };
137 
138   // If the builtin has been declared explicitly with an assembler label,
139   // use the mangled name. This differs from the plain label on platforms
140   // that prefix labels.
141   if (FD->hasAttr<AsmLabelAttr>())
142     Name = getMangledName(D);
143   else {
144     // TODO: This mutation should also be applied to targets other than PPC,
145     // once the backend supports IEEE 128-bit style libcalls.
146     if (getTriple().isPPC64() &&
147         &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad() &&
148         F128Builtins.find(BuiltinID) != F128Builtins.end())
149       Name = F128Builtins[BuiltinID];
150     else if (getTriple().isOSAIX() &&
151              &getTarget().getLongDoubleFormat() ==
152                  &llvm::APFloat::IEEEdouble() &&
153              AIXLongDouble64Builtins.find(BuiltinID) !=
154                  AIXLongDouble64Builtins.end())
155       Name = AIXLongDouble64Builtins[BuiltinID];
156     else
157       Name = Context.BuiltinInfo.getName(BuiltinID).substr(10);
158   }
159 
160   llvm::FunctionType *Ty =
161     cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
162 
163   return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
164 }
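
// Examples of the resulting name mapping (illustrative only):
//   __builtin_fabsf                                       -> fabsf
//   __builtin_printf  (PPC64 with IEEE-quad long double)  -> __printfieee128
//   __builtin_frexpl  (AIX with 64-bit long double)       -> frexp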
165 
166 /// Emit the conversions required to turn the given value into an
167 /// integer of the given size.
168 static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
169                         QualType T, llvm::IntegerType *IntType) {
170   V = CGF.EmitToMemory(V, T);
171 
172   if (V->getType()->isPointerTy())
173     return CGF.Builder.CreatePtrToInt(V, IntType);
174 
175   assert(V->getType() == IntType);
176   return V;
177 }
178 
179 static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
180                           QualType T, llvm::Type *ResultType) {
181   V = CGF.EmitFromMemory(V, T);
182 
183   if (ResultType->isPointerTy())
184     return CGF.Builder.CreateIntToPtr(V, ResultType);
185 
186   assert(V->getType() == ResultType);
187   return V;
188 }
189 
190 static llvm::Value *CheckAtomicAlignment(CodeGenFunction &CGF,
191                                          const CallExpr *E) {
192   ASTContext &Ctx = CGF.getContext();
193   Address Ptr = CGF.EmitPointerWithAlignment(E->getArg(0));
194   unsigned Bytes = Ptr.getElementType()->isPointerTy()
195                        ? Ctx.getTypeSizeInChars(Ctx.VoidPtrTy).getQuantity()
196                        : Ptr.getElementType()->getScalarSizeInBits() / 8;
197   unsigned Align = Ptr.getAlignment().getQuantity();
198   if (Align % Bytes != 0) {
199     DiagnosticsEngine &Diags = CGF.CGM.getDiags();
200     Diags.Report(E->getBeginLoc(), diag::warn_sync_op_misaligned);
201   }
202   return Ptr.getPointer();
203 }
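
// For example (hypothetical caller-side code), an under-aligned operand such as
//   struct __attribute__((packed)) S { char c; int i; } s;
//   __sync_fetch_and_add(&s.i, 1);
// reaches this helper with a 1-byte-aligned pointer to a 4-byte int and emits
// the warn_sync_op_misaligned diagnostic; naturally aligned pointers pass
// through silently.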
204 
205 /// Utility to insert an atomic instruction based on the given
206 /// AtomicRMWInst::BinOp kind and the expression node.
207 static Value *MakeBinaryAtomicValue(
208     CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E,
209     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
210 
211   QualType T = E->getType();
212   assert(E->getArg(0)->getType()->isPointerType());
213   assert(CGF.getContext().hasSameUnqualifiedType(T,
214                                   E->getArg(0)->getType()->getPointeeType()));
215   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
216 
217   llvm::Value *DestPtr = CheckAtomicAlignment(CGF, E);
218   unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
219 
220   llvm::IntegerType *IntType =
221     llvm::IntegerType::get(CGF.getLLVMContext(),
222                            CGF.getContext().getTypeSize(T));
223   llvm::Type *IntPtrType =
224       llvm::PointerType::get(CGF.getLLVMContext(), AddrSpace);
225 
226   llvm::Value *Args[2];
227   Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
228   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
229   llvm::Type *ValueType = Args[1]->getType();
230   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
231 
232   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
233       Kind, Args[0], Args[1], Ordering);
234   return EmitFromInt(CGF, Result, T, ValueType);
235 }
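
// Illustrative lowering (approximate IR for a 32-bit int with the default
// seq_cst ordering): a call such as
//   int old = __sync_fetch_and_add(&x, 5);
// becomes roughly
//   %old = atomicrmw add ptr %x, i32 5 seq_cst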
236 
237 static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
238   Value *Val = CGF.EmitScalarExpr(E->getArg(0));
239   Value *Address = CGF.EmitScalarExpr(E->getArg(1));
240 
241   // Convert the type of the pointer to a pointer to the stored type.
242   Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
243   unsigned SrcAddrSpace = Address->getType()->getPointerAddressSpace();
244   Value *BC = CGF.Builder.CreateBitCast(
245       Address, llvm::PointerType::get(Val->getType(), SrcAddrSpace), "cast");
246   LValue LV = CGF.MakeNaturalAlignAddrLValue(BC, E->getArg(0)->getType());
247   LV.setNontemporal(true);
248   CGF.EmitStoreOfScalar(Val, LV, false);
249   return nullptr;
250 }
251 
252 static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
253   Value *Address = CGF.EmitScalarExpr(E->getArg(0));
254 
255   LValue LV = CGF.MakeNaturalAlignAddrLValue(Address, E->getType());
256   LV.setNontemporal(true);
257   return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
258 }
259 
260 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
261                                llvm::AtomicRMWInst::BinOp Kind,
262                                const CallExpr *E) {
263   return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
264 }
265 
266 /// Utility to insert an atomic instruction based on the given
267 /// AtomicRMWInst::BinOp kind and the expression node, where the return
268 /// value is the result of the operation.
269 static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
270                                    llvm::AtomicRMWInst::BinOp Kind,
271                                    const CallExpr *E,
272                                    Instruction::BinaryOps Op,
273                                    bool Invert = false) {
274   QualType T = E->getType();
275   assert(E->getArg(0)->getType()->isPointerType());
276   assert(CGF.getContext().hasSameUnqualifiedType(T,
277                                   E->getArg(0)->getType()->getPointeeType()));
278   assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
279 
280   llvm::Value *DestPtr = CheckAtomicAlignment(CGF, E);
281 
282   llvm::IntegerType *IntType = llvm::IntegerType::get(
283       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
284 
285   llvm::Value *Args[2];
286   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
287   llvm::Type *ValueType = Args[1]->getType();
288   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
289   Args[0] = DestPtr;
290 
291   llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
292       Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent);
293   Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
294   if (Invert)
295     Result =
296         CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
297                                 llvm::ConstantInt::getAllOnesValue(IntType));
298   Result = EmitFromInt(CGF, Result, T, ValueType);
299   return RValue::get(Result);
300 }
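
// Illustrative sketch: the __sync_<op>_and_fetch builtins return the value
// *after* the operation, so the atomicrmw result is recombined with the
// operand. For example,
//   int v = __sync_add_and_fetch(&x, 5);
// is roughly
//   %old = atomicrmw add ptr %x, i32 5 seq_cst
//   %v   = add i32 %old, 5
// The Invert path serves __sync_nand_and_fetch, which returns ~(old & val).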
301 
302 /// Utility to insert an atomic cmpxchg instruction.
303 ///
304 /// @param CGF The current codegen function.
305 /// @param E   Builtin call expression to convert to cmpxchg.
306 ///            arg0 - address to operate on
307 ///            arg1 - value to compare with
308 ///            arg2 - new value
309 /// @param ReturnBool Specifies whether to return success flag of
310 ///                   cmpxchg result or the old value.
311 ///
312 /// @returns result of cmpxchg, according to ReturnBool
313 ///
314 /// Note: to lower Microsoft's _InterlockedCompareExchange* intrinsics,
315 /// invoke EmitAtomicCmpXchgForMSIntrin instead.
316 static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
317                                      bool ReturnBool) {
318   QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
319   llvm::Value *DestPtr = CheckAtomicAlignment(CGF, E);
320 
321   llvm::IntegerType *IntType = llvm::IntegerType::get(
322       CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
323 
324   Value *Args[3];
325   Args[0] = DestPtr;
326   Args[1] = CGF.EmitScalarExpr(E->getArg(1));
327   llvm::Type *ValueType = Args[1]->getType();
328   Args[1] = EmitToInt(CGF, Args[1], T, IntType);
329   Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
330 
331   Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
332       Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent,
333       llvm::AtomicOrdering::SequentiallyConsistent);
334   if (ReturnBool)
335     // Extract boolean success flag and zext it to int.
336     return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
337                                   CGF.ConvertType(E->getType()));
338   else
339     // Extract old value and emit it using the same type as compare value.
340     return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
341                        ValueType);
342 }
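
// Illustrative lowering (approximate IR for a 32-bit int):
//   __sync_bool_compare_and_swap(&x, oldv, newv)   // ReturnBool == true
//     %pair = cmpxchg ptr %x, i32 %oldv, i32 %newv seq_cst seq_cst
//     %ok   = extractvalue { i32, i1 } %pair, 1    ; then zext to the result type
//   __sync_val_compare_and_swap(&x, oldv, newv)    // ReturnBool == false
//     extracts element 0 of the pair (the old value) instead.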
343 
344 /// This function should be invoked to emit atomic cmpxchg for Microsoft's
345 /// _InterlockedCompareExchange* intrinsics which have the following signature:
346 /// T _InterlockedCompareExchange(T volatile *Destination,
347 ///                               T Exchange,
348 ///                               T Comparand);
349 ///
350 /// The LLVM 'cmpxchg' instruction, in contrast, takes its operands in the
351 /// order: cmpxchg *Destination, Comparand, Exchange.
352 /// So we need to swap Comparand and Exchange when invoking
353 /// CreateAtomicCmpXchg. That is also why we cannot reuse the utility
354 /// function MakeAtomicCmpXchgValue above: it expects its arguments to be
355 /// in cmpxchg order already.
356 
357 static
358 Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
359     AtomicOrdering SuccessOrdering = AtomicOrdering::SequentiallyConsistent) {
360   assert(E->getArg(0)->getType()->isPointerType());
361   assert(CGF.getContext().hasSameUnqualifiedType(
362       E->getType(), E->getArg(0)->getType()->getPointeeType()));
363   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
364                                                  E->getArg(1)->getType()));
365   assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
366                                                  E->getArg(2)->getType()));
367 
368   auto *Destination = CGF.EmitScalarExpr(E->getArg(0));
369   auto *Comparand = CGF.EmitScalarExpr(E->getArg(2));
370   auto *Exchange = CGF.EmitScalarExpr(E->getArg(1));
371 
372   // For Release ordering, the failure ordering should be Monotonic.
373   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release ?
374                          AtomicOrdering::Monotonic :
375                          SuccessOrdering;
376 
377   // The atomic instruction is marked volatile for consistency with MSVC. This
378   // blocks the few atomics optimizations that LLVM has. If we want to optimize
379   // _Interlocked* operations in the future, we will have to remove the volatile
380   // marker.
381   auto *Result = CGF.Builder.CreateAtomicCmpXchg(
382                    Destination, Comparand, Exchange,
383                    SuccessOrdering, FailureOrdering);
384   Result->setVolatile(true);
385   return CGF.Builder.CreateExtractValue(Result, 0);
386 }
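
// For example (MSVC semantics, approximate IR): a call such as
//   long prev = _InterlockedCompareExchange(&Dest, Exch, Comp);
// is lowered to roughly
//   %pair = cmpxchg volatile ptr %Dest, i32 %Comp, i32 %Exch seq_cst seq_cst
//   %prev = extractvalue { i32, i1 } %pair, 0
// Note the Comparand/Exchange operand swap relative to the C prototype.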
387 
388 // 64-bit Microsoft platforms support 128-bit cmpxchg operations. They are
389 // prototyped like this:
390 //
391 // unsigned char _InterlockedCompareExchange128...(
392 //     __int64 volatile * _Destination,
393 //     __int64 _ExchangeHigh,
394 //     __int64 _ExchangeLow,
395 //     __int64 * _ComparandResult);
396 static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
397                                               const CallExpr *E,
398                                               AtomicOrdering SuccessOrdering) {
399   assert(E->getNumArgs() == 4);
400   llvm::Value *Destination = CGF.EmitScalarExpr(E->getArg(0));
401   llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
402   llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
403   llvm::Value *ComparandPtr = CGF.EmitScalarExpr(E->getArg(3));
404 
405   assert(Destination->getType()->isPointerTy());
406   assert(!ExchangeHigh->getType()->isPointerTy());
407   assert(!ExchangeLow->getType()->isPointerTy());
408   assert(ComparandPtr->getType()->isPointerTy());
409 
410   // For Release ordering, the failure ordering should be Monotonic.
411   auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
412                              ? AtomicOrdering::Monotonic
413                              : SuccessOrdering;
414 
415   // Convert to i128 pointers and values.
416   llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
417   Address ComparandResult(ComparandPtr, Int128Ty,
418                           CGF.getContext().toCharUnitsFromBits(128));
419 
420   // (((i128)hi) << 64) | ((i128)lo)
421   ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
422   ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
423   ExchangeHigh =
424       CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
425   llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
426 
427   // Load the comparand for the instruction.
428   llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandResult);
429 
430   auto *CXI = CGF.Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
431                                               SuccessOrdering, FailureOrdering);
432 
433   // The atomic instruction is marked volatile for consistency with MSVC. This
434   // blocks the few atomics optimizations that LLVM has. If we want to optimize
435   // _Interlocked* operations in the future, we will have to remove the volatile
436   // marker.
437   CXI->setVolatile(true);
438 
439   // Store the result as an outparameter.
440   CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
441                           ComparandResult);
442 
443   // Get the success boolean and zero extend it to i8.
444   Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
445   return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
446 }
447 
448 static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
449     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
450   assert(E->getArg(0)->getType()->isPointerType());
451 
452   auto *IntTy = CGF.ConvertType(E->getType());
453   auto *Result = CGF.Builder.CreateAtomicRMW(
454                    AtomicRMWInst::Add,
455                    CGF.EmitScalarExpr(E->getArg(0)),
456                    ConstantInt::get(IntTy, 1),
457                    Ordering);
458   return CGF.Builder.CreateAdd(Result, ConstantInt::get(IntTy, 1));
459 }
460 
461 static Value *EmitAtomicDecrementValue(CodeGenFunction &CGF, const CallExpr *E,
462     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
463   assert(E->getArg(0)->getType()->isPointerType());
464 
465   auto *IntTy = CGF.ConvertType(E->getType());
466   auto *Result = CGF.Builder.CreateAtomicRMW(
467                    AtomicRMWInst::Sub,
468                    CGF.EmitScalarExpr(E->getArg(0)),
469                    ConstantInt::get(IntTy, 1),
470                    Ordering);
471   return CGF.Builder.CreateSub(Result, ConstantInt::get(IntTy, 1));
472 }
473 
474 // Build a plain volatile load.
475 static Value *EmitISOVolatileLoad(CodeGenFunction &CGF, const CallExpr *E) {
476   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
477   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
478   CharUnits LoadSize = CGF.getContext().getTypeSizeInChars(ElTy);
479   llvm::Type *ITy =
480       llvm::IntegerType::get(CGF.getLLVMContext(), LoadSize.getQuantity() * 8);
481   llvm::LoadInst *Load = CGF.Builder.CreateAlignedLoad(ITy, Ptr, LoadSize);
482   Load->setVolatile(true);
483   return Load;
484 }
485 
486 // Build a plain volatile store.
487 static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) {
488   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
489   Value *Value = CGF.EmitScalarExpr(E->getArg(1));
490   QualType ElTy = E->getArg(0)->getType()->getPointeeType();
491   CharUnits StoreSize = CGF.getContext().getTypeSizeInChars(ElTy);
492   llvm::StoreInst *Store =
493       CGF.Builder.CreateAlignedStore(Value, Ptr, StoreSize);
494   Store->setVolatile(true);
495   return Store;
496 }
497 
498 // Emit a simple mangled intrinsic that has 1 argument and a return type
499 // matching the argument type. Depending on mode, this may be a constrained
500 // floating-point intrinsic.
501 static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
502                                 const CallExpr *E, unsigned IntrinsicID,
503                                 unsigned ConstrainedIntrinsicID) {
504   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
505 
506   if (CGF.Builder.getIsFPConstrained()) {
507     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
508     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
509     return CGF.Builder.CreateConstrainedFPCall(F, { Src0 });
510   } else {
511     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
512     return CGF.Builder.CreateCall(F, Src0);
513   }
514 }
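
// Illustrative sketch: for __builtin_sqrt(x) this emits either
//   call double @llvm.sqrt.f64(double %x)
// or, in a strict/constrained FP context (e.g. #pragma STDC FENV_ACCESS ON),
//   call double @llvm.experimental.constrained.sqrt.f64(
//            double %x, metadata !"round.dynamic", metadata !"fpexcept.strict")
// where the rounding/exception metadata shown is only indicative; the builder
// fills it in from the current FP options.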
515 
516 // Emit an intrinsic that has 2 operands of the same type as its result.
517 // Depending on mode, this may be a constrained floating-point intrinsic.
518 static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
519                                 const CallExpr *E, unsigned IntrinsicID,
520                                 unsigned ConstrainedIntrinsicID) {
521   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
522   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
523 
524   if (CGF.Builder.getIsFPConstrained()) {
525     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
526     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
527     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 });
528   } else {
529     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
530     return CGF.Builder.CreateCall(F, { Src0, Src1 });
531   }
532 }
533 
534 // Same as above, but the second argument's type is also mangled into the name.
535 static Value *emitBinaryExpMaybeConstrainedFPBuiltin(
536     CodeGenFunction &CGF, const CallExpr *E, llvm::Intrinsic::ID IntrinsicID,
537     llvm::Intrinsic::ID ConstrainedIntrinsicID) {
538   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
539   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
540 
541   if (CGF.Builder.getIsFPConstrained()) {
542     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
543     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
544                                        {Src0->getType(), Src1->getType()});
545     return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
546   }
547 
548   Function *F =
549       CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
550   return CGF.Builder.CreateCall(F, {Src0, Src1});
551 }
552 
553 // Emit an intrinsic that has 3 operands of the same type as its result.
554 // Depending on mode, this may be a constrained floating-point intrinsic.
555 static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
556                                  const CallExpr *E, unsigned IntrinsicID,
557                                  unsigned ConstrainedIntrinsicID) {
558   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
559   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
560   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
561 
562   if (CGF.Builder.getIsFPConstrained()) {
563     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
564     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
565     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 });
566   } else {
567     Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
568     return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
569   }
570 }
571 
572 // Emit an intrinsic where all operands are of the same type as the result.
573 // Depending on mode, this may be a constrained floating-point intrinsic.
574 static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
575                                                 unsigned IntrinsicID,
576                                                 unsigned ConstrainedIntrinsicID,
577                                                 llvm::Type *Ty,
578                                                 ArrayRef<Value *> Args) {
579   Function *F;
580   if (CGF.Builder.getIsFPConstrained())
581     F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
582   else
583     F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
584 
585   if (CGF.Builder.getIsFPConstrained())
586     return CGF.Builder.CreateConstrainedFPCall(F, Args);
587   else
588     return CGF.Builder.CreateCall(F, Args);
589 }
590 
591 // Emit a simple mangled intrinsic that has 1 argument and a return type
592 // matching the argument type.
593 static Value *emitUnaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
594                                unsigned IntrinsicID,
595                                llvm::StringRef Name = "") {
596   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
597 
598   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
599   return CGF.Builder.CreateCall(F, Src0, Name);
600 }
601 
602 // Emit an intrinsic that has 2 operands of the same type as its result.
603 static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
604                                 const CallExpr *E,
605                                 unsigned IntrinsicID) {
606   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
607   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
608 
609   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
610   return CGF.Builder.CreateCall(F, { Src0, Src1 });
611 }
612 
613 // Emit an intrinsic that has 3 operands of the same type as its result.
614 static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
615                                  const CallExpr *E,
616                                  unsigned IntrinsicID) {
617   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
618   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
619   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
620 
621   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
622   return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
623 }
624 
625 // Emit an intrinsic that has 1 float or double operand, and 1 integer.
626 static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
627                                const CallExpr *E,
628                                unsigned IntrinsicID) {
629   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
630   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
631 
632   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
633   return CGF.Builder.CreateCall(F, {Src0, Src1});
634 }
635 
636 // Emit an intrinsic that has overloaded integer result and fp operand.
637 static Value *
638 emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E,
639                                         unsigned IntrinsicID,
640                                         unsigned ConstrainedIntrinsicID) {
641   llvm::Type *ResultType = CGF.ConvertType(E->getType());
642   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
643 
644   if (CGF.Builder.getIsFPConstrained()) {
645     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
646     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
647                                        {ResultType, Src0->getType()});
648     return CGF.Builder.CreateConstrainedFPCall(F, {Src0});
649   } else {
650     Function *F =
651         CGF.CGM.getIntrinsic(IntrinsicID, {ResultType, Src0->getType()});
652     return CGF.Builder.CreateCall(F, Src0);
653   }
654 }
655 
656 static Value *emitFrexpBuiltin(CodeGenFunction &CGF, const CallExpr *E,
657                                llvm::Intrinsic::ID IntrinsicID) {
658   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
659   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
660 
661   QualType IntPtrTy = E->getArg(1)->getType()->getPointeeType();
662   llvm::Type *IntTy = CGF.ConvertType(IntPtrTy);
663   llvm::Function *F =
664       CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), IntTy});
665   llvm::Value *Call = CGF.Builder.CreateCall(F, Src0);
666 
667   llvm::Value *Exp = CGF.Builder.CreateExtractValue(Call, 1);
668   LValue LV = CGF.MakeNaturalAlignAddrLValue(Src1, IntPtrTy);
669   CGF.EmitStoreOfScalar(Exp, LV);
670 
671   return CGF.Builder.CreateExtractValue(Call, 0);
672 }
673 
674 /// EmitFAbs - Emit a call to @llvm.fabs().
675 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
676   Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
677   llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
678   Call->setDoesNotAccessMemory();
679   return Call;
680 }
681 
682 /// Emit the computation of the sign bit for a floating point value. Returns
683 /// the i1 sign bit value.
684 static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
685   LLVMContext &C = CGF.CGM.getLLVMContext();
686 
687   llvm::Type *Ty = V->getType();
688   int Width = Ty->getPrimitiveSizeInBits();
689   llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
690   V = CGF.Builder.CreateBitCast(V, IntTy);
691   if (Ty->isPPC_FP128Ty()) {
692     // We want the sign bit of the higher-order double. The bitcast we just
693     // did works as if the double-double was stored to memory and then
694     // read as an i128. The "store" will put the higher-order double in the
695     // lower address in both little- and big-endian modes, but the "load"
696     // will treat those bits as a different part of the i128: the low bits in
697     // little-endian mode, the high bits in big-endian mode. Therefore, on
698     // big-endian targets we need to shift the high bits down before truncating.
699     Width >>= 1;
700     if (CGF.getTarget().isBigEndian()) {
701       Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
702       V = CGF.Builder.CreateLShr(V, ShiftCst);
703     }
704     // We are truncating value in order to extract the higher-order
705     // double, which we will be using to extract the sign from.
706     IntTy = llvm::IntegerType::get(C, Width);
707     V = CGF.Builder.CreateTrunc(V, IntTy);
708   }
709   Value *Zero = llvm::Constant::getNullValue(IntTy);
710   return CGF.Builder.CreateICmpSLT(V, Zero);
711 }
712 
713 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
714                               const CallExpr *E, llvm::Constant *calleeValue) {
715   CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
716   return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
717 }
718 
719 /// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
720 /// depending on IntrinsicID.
721 ///
722 /// \arg CGF The current codegen function.
723 /// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
724 /// \arg X The first argument to the llvm.*.with.overflow.*.
725 /// \arg Y The second argument to the llvm.*.with.overflow.*.
726 /// \arg Carry The carry returned by the llvm.*.with.overflow.*.
727 /// \returns The result (i.e. sum/product) returned by the intrinsic.
728 static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
729                                           const llvm::Intrinsic::ID IntrinsicID,
730                                           llvm::Value *X, llvm::Value *Y,
731                                           llvm::Value *&Carry) {
732   // Make sure we have integers of the same width.
733   assert(X->getType() == Y->getType() &&
734          "Arguments must be the same type. (Did you forget to make sure both "
735          "arguments have the same integer width?)");
736 
737   Function *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
738   llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
739   Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
740   return CGF.Builder.CreateExtractValue(Tmp, 0);
741 }
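
// Hypothetical usage sketch: a caller lowering
//   bool ovf = __builtin_sadd_overflow(a, b, &res);
// could do
//   llvm::Value *Carry;
//   llvm::Value *Sum = EmitOverflowIntrinsic(
//       *this, llvm::Intrinsic::sadd_with_overflow, A, B, Carry);
// where Sum receives the (wrapped) result and Carry the i1 overflow flag.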
742 
743 static Value *emitRangedBuiltin(CodeGenFunction &CGF,
744                                 unsigned IntrinsicID,
745                                 int low, int high) {
746   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
747   llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
748   Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
749   llvm::Instruction *Call = CGF.Builder.CreateCall(F);
750   Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
751   Call->setMetadata(llvm::LLVMContext::MD_noundef,
752                     llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
753   return Call;
754 }
755 
756 namespace {
757   struct WidthAndSignedness {
758     unsigned Width;
759     bool Signed;
760   };
761 }
762 
763 static WidthAndSignedness
764 getIntegerWidthAndSignedness(const clang::ASTContext &context,
765                              const clang::QualType Type) {
766   assert(Type->isIntegerType() && "Given type is not an integer.");
767   unsigned Width = Type->isBooleanType()  ? 1
768                    : Type->isBitIntType() ? context.getIntWidth(Type)
769                                           : context.getTypeInfo(Type).Width;
770   bool Signed = Type->isSignedIntegerType();
771   return {Width, Signed};
772 }
773 
774 // Given one or more integer types, this function produces an integer type that
775 // encompasses them: any value in one of the given types could be expressed in
776 // the encompassing type.
777 static struct WidthAndSignedness
778 EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
779   assert(Types.size() > 0 && "Empty list of types.");
780 
781   // If any of the given types is signed, we must return a signed type.
782   bool Signed = false;
783   for (const auto &Type : Types) {
784     Signed |= Type.Signed;
785   }
786 
787   // The encompassing type must have a width greater than or equal to the width
788   // of the specified types.  Additionally, if the encompassing type is signed,
789   // its width must be strictly greater than the width of any unsigned types
790   // given.
791   unsigned Width = 0;
792   for (const auto &Type : Types) {
793     unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
794     if (Width < MinWidth) {
795       Width = MinWidth;
796     }
797   }
798 
799   return {Width, Signed};
800 }
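
// Worked example: encompassing {Width=32, Signed=true} and {Width=64,
// Signed=false} yields {Width=65, Signed=true}: the result must be signed
// because one input is, and a signed type needs one extra bit to cover every
// unsigned 64-bit value.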
801 
802 Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
803   llvm::Type *DestType = Int8PtrTy;
804   if (ArgValue->getType() != DestType)
805     ArgValue =
806         Builder.CreateBitCast(ArgValue, DestType, ArgValue->getName().data());
807 
808   Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
809   return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
810 }
811 
812 /// Checks if using the result of __builtin_object_size(p, @p From) in place of
813 /// __builtin_object_size(p, @p To) is correct
814 static bool areBOSTypesCompatible(int From, int To) {
815   // Note: Our __builtin_object_size implementation currently treats Type=0 and
816   // Type=2 identically. Encoding this implementation detail here may make
817   // improving __builtin_object_size difficult in the future, so it's omitted.
818   return From == To || (From == 0 && To == 1) || (From == 3 && To == 2);
819 }
820 
821 static llvm::Value *
822 getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
823   return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
824 }
825 
826 llvm::Value *
827 CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
828                                                  llvm::IntegerType *ResType,
829                                                  llvm::Value *EmittedE,
830                                                  bool IsDynamic) {
831   uint64_t ObjectSize;
832   if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
833     return emitBuiltinObjectSize(E, Type, ResType, EmittedE, IsDynamic);
834   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
835 }
836 
837 /// Returns a Value corresponding to the size of the given expression.
838 /// This Value may be either of the following:
839 ///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
840 ///     it)
841 ///   - A call to the @llvm.objectsize intrinsic
842 ///
843 /// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
844 /// and we wouldn't otherwise try to reference a pass_object_size parameter,
845 /// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
846 llvm::Value *
847 CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
848                                        llvm::IntegerType *ResType,
849                                        llvm::Value *EmittedE, bool IsDynamic) {
850   // We need to reference an argument if the pointer is a parameter with the
851   // pass_object_size attribute.
852   if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
853     auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
854     auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
855     if (Param != nullptr && PS != nullptr &&
856         areBOSTypesCompatible(PS->getType(), Type)) {
857       auto Iter = SizeArguments.find(Param);
858       assert(Iter != SizeArguments.end());
859 
860       const ImplicitParamDecl *D = Iter->second;
861       auto DIter = LocalDeclMap.find(D);
862       assert(DIter != LocalDeclMap.end());
863 
864       return EmitLoadOfScalar(DIter->second, /*Volatile=*/false,
865                               getContext().getSizeType(), E->getBeginLoc());
866     }
867   }
868 
869   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
870   // evaluate E for side-effects. In either case, we shouldn't lower to
871   // @llvm.objectsize.
872   if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
873     return getDefaultBuiltinObjectSizeResult(Type, ResType);
874 
875   Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
876   assert(Ptr->getType()->isPointerTy() &&
877          "Non-pointer passed to __builtin_object_size?");
878 
879   Function *F =
880       CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
881 
882   // LLVM only supports 0 and 2, so pass that distinction along as a boolean.
883   Value *Min = Builder.getInt1((Type & 2) != 0);
884   // For GCC compatibility, __builtin_object_size treats NULL as unknown size.
885   Value *NullIsUnknown = Builder.getTrue();
886   Value *Dynamic = Builder.getInt1(IsDynamic);
887   return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown, Dynamic});
888 }
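
// Illustrative lowering: when no pass_object_size parameter applies,
//   size_t n = __builtin_object_size(p, 1);
// becomes roughly
//   call i64 @llvm.objectsize.i64.p0(ptr %p, i1 false, i1 true, i1 false)
// i.e. min=false (types 0/1 report the maximum), null-is-unknown=true for GCC
// compatibility, and dynamic=false.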
889 
890 namespace {
891 /// A struct to generically describe a bit test intrinsic.
892 struct BitTest {
893   enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
894   enum InterlockingKind : uint8_t {
895     Unlocked,
896     Sequential,
897     Acquire,
898     Release,
899     NoFence
900   };
901 
902   ActionKind Action;
903   InterlockingKind Interlocking;
904   bool Is64Bit;
905 
906   static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
907 };
908 } // namespace
909 
910 BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
911   switch (BuiltinID) {
912     // Main portable variants.
913   case Builtin::BI_bittest:
914     return {TestOnly, Unlocked, false};
915   case Builtin::BI_bittestandcomplement:
916     return {Complement, Unlocked, false};
917   case Builtin::BI_bittestandreset:
918     return {Reset, Unlocked, false};
919   case Builtin::BI_bittestandset:
920     return {Set, Unlocked, false};
921   case Builtin::BI_interlockedbittestandreset:
922     return {Reset, Sequential, false};
923   case Builtin::BI_interlockedbittestandset:
924     return {Set, Sequential, false};
925 
926     // X86-specific 64-bit variants.
927   case Builtin::BI_bittest64:
928     return {TestOnly, Unlocked, true};
929   case Builtin::BI_bittestandcomplement64:
930     return {Complement, Unlocked, true};
931   case Builtin::BI_bittestandreset64:
932     return {Reset, Unlocked, true};
933   case Builtin::BI_bittestandset64:
934     return {Set, Unlocked, true};
935   case Builtin::BI_interlockedbittestandreset64:
936     return {Reset, Sequential, true};
937   case Builtin::BI_interlockedbittestandset64:
938     return {Set, Sequential, true};
939 
940     // ARM/AArch64-specific ordering variants.
941   case Builtin::BI_interlockedbittestandset_acq:
942     return {Set, Acquire, false};
943   case Builtin::BI_interlockedbittestandset_rel:
944     return {Set, Release, false};
945   case Builtin::BI_interlockedbittestandset_nf:
946     return {Set, NoFence, false};
947   case Builtin::BI_interlockedbittestandreset_acq:
948     return {Reset, Acquire, false};
949   case Builtin::BI_interlockedbittestandreset_rel:
950     return {Reset, Release, false};
951   case Builtin::BI_interlockedbittestandreset_nf:
952     return {Reset, NoFence, false};
953   }
954   llvm_unreachable("expected only bittest intrinsics");
955 }
956 
957 static char bitActionToX86BTCode(BitTest::ActionKind A) {
958   switch (A) {
959   case BitTest::TestOnly:   return '\0';
960   case BitTest::Complement: return 'c';
961   case BitTest::Reset:      return 'r';
962   case BitTest::Set:        return 's';
963   }
964   llvm_unreachable("invalid action");
965 }
966 
967 static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
968                                             BitTest BT,
969                                             const CallExpr *E, Value *BitBase,
970                                             Value *BitPos) {
971   char Action = bitActionToX86BTCode(BT.Action);
972   char SizeSuffix = BT.Is64Bit ? 'q' : 'l';
973 
974   // Build the assembly.
975   SmallString<64> Asm;
976   raw_svector_ostream AsmOS(Asm);
977   if (BT.Interlocking != BitTest::Unlocked)
978     AsmOS << "lock ";
979   AsmOS << "bt";
980   if (Action)
981     AsmOS << Action;
982   AsmOS << SizeSuffix << " $2, ($1)";
983 
984   // Build the constraints. FIXME: We should support immediates when possible.
985   std::string Constraints = "={@ccc},r,r,~{cc},~{memory}";
986   std::string_view MachineClobbers = CGF.getTarget().getClobbers();
987   if (!MachineClobbers.empty()) {
988     Constraints += ',';
989     Constraints += MachineClobbers;
990   }
991   llvm::IntegerType *IntType = llvm::IntegerType::get(
992       CGF.getLLVMContext(),
993       CGF.getContext().getTypeSize(E->getArg(1)->getType()));
994   llvm::Type *PtrType = llvm::PointerType::getUnqual(CGF.getLLVMContext());
995   llvm::FunctionType *FTy =
996       llvm::FunctionType::get(CGF.Int8Ty, {PtrType, IntType}, false);
997 
998   llvm::InlineAsm *IA =
999       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1000   return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
1001 }
1002 
1003 static llvm::AtomicOrdering
1004 getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
1005   switch (I) {
1006   case BitTest::Unlocked:   return llvm::AtomicOrdering::NotAtomic;
1007   case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
1008   case BitTest::Acquire:    return llvm::AtomicOrdering::Acquire;
1009   case BitTest::Release:    return llvm::AtomicOrdering::Release;
1010   case BitTest::NoFence:    return llvm::AtomicOrdering::Monotonic;
1011   }
1012   llvm_unreachable("invalid interlocking");
1013 }
1014 
1015 /// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
1016 /// bits and a bit position and read and optionally modify the bit at that
1017 /// position. The position index can be arbitrarily large, i.e. it can be larger
1018 /// than 31 or 63, so we need an indexed load in the general case.
1019 static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
1020                                          unsigned BuiltinID,
1021                                          const CallExpr *E) {
1022   Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
1023   Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));
1024 
1025   BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);
1026 
1027   // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
1028   // indexing operation internally. Use them if possible.
1029   if (CGF.getTarget().getTriple().isX86())
1030     return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);
1031 
1032   // Otherwise, use generic code to load one byte and test the bit. Use all but
1033   // the bottom three bits as the array index, and the bottom three bits to form
1034   // a mask.
1035   // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
1036   Value *ByteIndex = CGF.Builder.CreateAShr(
1037       BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
1038   Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
1039   Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
1040                                                  ByteIndex, "bittest.byteaddr"),
1041                    CGF.Int8Ty, CharUnits::One());
1042   Value *PosLow =
1043       CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
1044                             llvm::ConstantInt::get(CGF.Int8Ty, 0x7));
1045 
1046   // The updating instructions will need a mask.
1047   Value *Mask = nullptr;
1048   if (BT.Action != BitTest::TestOnly) {
1049     Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
1050                                  "bittest.mask");
1051   }
1052 
1053   // Check the action and ordering of the interlocked intrinsics.
1054   llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);
1055 
1056   Value *OldByte = nullptr;
1057   if (Ordering != llvm::AtomicOrdering::NotAtomic) {
1058     // Emit a combined atomicrmw load/store operation for the interlocked
1059     // intrinsics.
1060     llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
1061     if (BT.Action == BitTest::Reset) {
1062       Mask = CGF.Builder.CreateNot(Mask);
1063       RMWOp = llvm::AtomicRMWInst::And;
1064     }
1065     OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr.getPointer(), Mask,
1066                                           Ordering);
1067   } else {
1068     // Emit a plain load for the non-interlocked intrinsics.
1069     OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
1070     Value *NewByte = nullptr;
1071     switch (BT.Action) {
1072     case BitTest::TestOnly:
1073       // Don't store anything.
1074       break;
1075     case BitTest::Complement:
1076       NewByte = CGF.Builder.CreateXor(OldByte, Mask);
1077       break;
1078     case BitTest::Reset:
1079       NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
1080       break;
1081     case BitTest::Set:
1082       NewByte = CGF.Builder.CreateOr(OldByte, Mask);
1083       break;
1084     }
1085     if (NewByte)
1086       CGF.Builder.CreateStore(NewByte, ByteAddr);
1087   }
1088 
1089   // However we loaded the old byte, either by plain load or atomicrmw, shift
1090   // the bit into the low position and mask it to 0 or 1.
1091   Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
1092   return CGF.Builder.CreateAnd(
1093       ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
1094 }
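
// Illustrative sketch of the generic (non-x86) path: for
//   unsigned char r = _bittestandset(base, pos);
// the emitted code is roughly
//   byte = load base[pos >> 3]
//   mask = 1 << (pos & 7)
//   store base[pos >> 3], byte | mask   ; an atomicrmw for the _interlocked* forms
//   r    = (byte >> (pos & 7)) & 1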
1095 
1096 static llvm::Value *emitPPCLoadReserveIntrinsic(CodeGenFunction &CGF,
1097                                                 unsigned BuiltinID,
1098                                                 const CallExpr *E) {
1099   Value *Addr = CGF.EmitScalarExpr(E->getArg(0));
1100 
1101   SmallString<64> Asm;
1102   raw_svector_ostream AsmOS(Asm);
1103   llvm::IntegerType *RetType = CGF.Int32Ty;
1104 
1105   switch (BuiltinID) {
1106   case clang::PPC::BI__builtin_ppc_ldarx:
1107     AsmOS << "ldarx ";
1108     RetType = CGF.Int64Ty;
1109     break;
1110   case clang::PPC::BI__builtin_ppc_lwarx:
1111     AsmOS << "lwarx ";
1112     RetType = CGF.Int32Ty;
1113     break;
1114   case clang::PPC::BI__builtin_ppc_lharx:
1115     AsmOS << "lharx ";
1116     RetType = CGF.Int16Ty;
1117     break;
1118   case clang::PPC::BI__builtin_ppc_lbarx:
1119     AsmOS << "lbarx ";
1120     RetType = CGF.Int8Ty;
1121     break;
1122   default:
1123     llvm_unreachable("Expected only PowerPC load reserve intrinsics");
1124   }
1125 
1126   AsmOS << "$0, ${1:y}";
1127 
1128   std::string Constraints = "=r,*Z,~{memory}";
1129   std::string_view MachineClobbers = CGF.getTarget().getClobbers();
1130   if (!MachineClobbers.empty()) {
1131     Constraints += ',';
1132     Constraints += MachineClobbers;
1133   }
1134 
1135   llvm::Type *PtrType = llvm::PointerType::getUnqual(CGF.getLLVMContext());
1136   llvm::FunctionType *FTy = llvm::FunctionType::get(RetType, {PtrType}, false);
1137 
1138   llvm::InlineAsm *IA =
1139       llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1140   llvm::CallInst *CI = CGF.Builder.CreateCall(IA, {Addr});
1141   CI->addParamAttr(
1142       0, Attribute::get(CGF.getLLVMContext(), Attribute::ElementType, RetType));
1143   return CI;
1144 }
1145 
1146 namespace {
1147 enum class MSVCSetJmpKind {
1148   _setjmpex,
1149   _setjmp3,
1150   _setjmp
1151 };
1152 }
1153 
1154 /// MSVC handles setjmp a bit differently on different platforms. On every
1155 /// architecture except 32-bit x86, the frame address is passed. On x86, extra
1156 /// parameters can be passed as variadic arguments, but we always pass none.
1157 static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
1158                                const CallExpr *E) {
1159   llvm::Value *Arg1 = nullptr;
1160   llvm::Type *Arg1Ty = nullptr;
1161   StringRef Name;
1162   bool IsVarArg = false;
1163   if (SJKind == MSVCSetJmpKind::_setjmp3) {
1164     Name = "_setjmp3";
1165     Arg1Ty = CGF.Int32Ty;
1166     Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
1167     IsVarArg = true;
1168   } else {
1169     Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
1170     Arg1Ty = CGF.Int8PtrTy;
1171     if (CGF.getTarget().getTriple().getArch() == llvm::Triple::aarch64) {
1172       Arg1 = CGF.Builder.CreateCall(
1173           CGF.CGM.getIntrinsic(Intrinsic::sponentry, CGF.AllocaInt8PtrTy));
1174     } else
1175       Arg1 = CGF.Builder.CreateCall(
1176           CGF.CGM.getIntrinsic(Intrinsic::frameaddress, CGF.AllocaInt8PtrTy),
1177           llvm::ConstantInt::get(CGF.Int32Ty, 0));
1178   }
1179 
1180   // Mark the call site and declaration with ReturnsTwice.
1181   llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
1182   llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
1183       CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
1184       llvm::Attribute::ReturnsTwice);
1185   llvm::FunctionCallee SetJmpFn = CGF.CGM.CreateRuntimeFunction(
1186       llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
1187       ReturnsTwiceAttr, /*Local=*/true);
1188 
1189   llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
1190       CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
1191   llvm::Value *Args[] = {Buf, Arg1};
1192   llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
1193   CB->setAttributes(ReturnsTwiceAttr);
1194   return RValue::get(CB);
1195 }
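
// For example (caller side): in an MSVC environment, _setjmp on 32-bit x86 is
// lowered through this helper as a variadic call _setjmp3(buf, 0), while on
// AArch64 _setjmpex(buf) gets @llvm.sponentry() as its second argument and
// other targets pass @llvm.frameaddress(0); every call is marked ReturnsTwice.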
1196 
1197 // Many MSVC builtins are shared between x64, ARM, and AArch64; to avoid
1198 // repeating code, we handle them here.
1199 enum class CodeGenFunction::MSVCIntrin {
1200   _BitScanForward,
1201   _BitScanReverse,
1202   _InterlockedAnd,
1203   _InterlockedDecrement,
1204   _InterlockedExchange,
1205   _InterlockedExchangeAdd,
1206   _InterlockedExchangeSub,
1207   _InterlockedIncrement,
1208   _InterlockedOr,
1209   _InterlockedXor,
1210   _InterlockedExchangeAdd_acq,
1211   _InterlockedExchangeAdd_rel,
1212   _InterlockedExchangeAdd_nf,
1213   _InterlockedExchange_acq,
1214   _InterlockedExchange_rel,
1215   _InterlockedExchange_nf,
1216   _InterlockedCompareExchange_acq,
1217   _InterlockedCompareExchange_rel,
1218   _InterlockedCompareExchange_nf,
1219   _InterlockedCompareExchange128,
1220   _InterlockedCompareExchange128_acq,
1221   _InterlockedCompareExchange128_rel,
1222   _InterlockedCompareExchange128_nf,
1223   _InterlockedOr_acq,
1224   _InterlockedOr_rel,
1225   _InterlockedOr_nf,
1226   _InterlockedXor_acq,
1227   _InterlockedXor_rel,
1228   _InterlockedXor_nf,
1229   _InterlockedAnd_acq,
1230   _InterlockedAnd_rel,
1231   _InterlockedAnd_nf,
1232   _InterlockedIncrement_acq,
1233   _InterlockedIncrement_rel,
1234   _InterlockedIncrement_nf,
1235   _InterlockedDecrement_acq,
1236   _InterlockedDecrement_rel,
1237   _InterlockedDecrement_nf,
1238   __fastfail,
1239 };
1240 
1241 static std::optional<CodeGenFunction::MSVCIntrin>
1242 translateArmToMsvcIntrin(unsigned BuiltinID) {
1243   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1244   switch (BuiltinID) {
1245   default:
1246     return std::nullopt;
1247   case clang::ARM::BI_BitScanForward:
1248   case clang::ARM::BI_BitScanForward64:
1249     return MSVCIntrin::_BitScanForward;
1250   case clang::ARM::BI_BitScanReverse:
1251   case clang::ARM::BI_BitScanReverse64:
1252     return MSVCIntrin::_BitScanReverse;
1253   case clang::ARM::BI_InterlockedAnd64:
1254     return MSVCIntrin::_InterlockedAnd;
1255   case clang::ARM::BI_InterlockedExchange64:
1256     return MSVCIntrin::_InterlockedExchange;
1257   case clang::ARM::BI_InterlockedExchangeAdd64:
1258     return MSVCIntrin::_InterlockedExchangeAdd;
1259   case clang::ARM::BI_InterlockedExchangeSub64:
1260     return MSVCIntrin::_InterlockedExchangeSub;
1261   case clang::ARM::BI_InterlockedOr64:
1262     return MSVCIntrin::_InterlockedOr;
1263   case clang::ARM::BI_InterlockedXor64:
1264     return MSVCIntrin::_InterlockedXor;
1265   case clang::ARM::BI_InterlockedDecrement64:
1266     return MSVCIntrin::_InterlockedDecrement;
1267   case clang::ARM::BI_InterlockedIncrement64:
1268     return MSVCIntrin::_InterlockedIncrement;
1269   case clang::ARM::BI_InterlockedExchangeAdd8_acq:
1270   case clang::ARM::BI_InterlockedExchangeAdd16_acq:
1271   case clang::ARM::BI_InterlockedExchangeAdd_acq:
1272   case clang::ARM::BI_InterlockedExchangeAdd64_acq:
1273     return MSVCIntrin::_InterlockedExchangeAdd_acq;
1274   case clang::ARM::BI_InterlockedExchangeAdd8_rel:
1275   case clang::ARM::BI_InterlockedExchangeAdd16_rel:
1276   case clang::ARM::BI_InterlockedExchangeAdd_rel:
1277   case clang::ARM::BI_InterlockedExchangeAdd64_rel:
1278     return MSVCIntrin::_InterlockedExchangeAdd_rel;
1279   case clang::ARM::BI_InterlockedExchangeAdd8_nf:
1280   case clang::ARM::BI_InterlockedExchangeAdd16_nf:
1281   case clang::ARM::BI_InterlockedExchangeAdd_nf:
1282   case clang::ARM::BI_InterlockedExchangeAdd64_nf:
1283     return MSVCIntrin::_InterlockedExchangeAdd_nf;
1284   case clang::ARM::BI_InterlockedExchange8_acq:
1285   case clang::ARM::BI_InterlockedExchange16_acq:
1286   case clang::ARM::BI_InterlockedExchange_acq:
1287   case clang::ARM::BI_InterlockedExchange64_acq:
1288     return MSVCIntrin::_InterlockedExchange_acq;
1289   case clang::ARM::BI_InterlockedExchange8_rel:
1290   case clang::ARM::BI_InterlockedExchange16_rel:
1291   case clang::ARM::BI_InterlockedExchange_rel:
1292   case clang::ARM::BI_InterlockedExchange64_rel:
1293     return MSVCIntrin::_InterlockedExchange_rel;
1294   case clang::ARM::BI_InterlockedExchange8_nf:
1295   case clang::ARM::BI_InterlockedExchange16_nf:
1296   case clang::ARM::BI_InterlockedExchange_nf:
1297   case clang::ARM::BI_InterlockedExchange64_nf:
1298     return MSVCIntrin::_InterlockedExchange_nf;
1299   case clang::ARM::BI_InterlockedCompareExchange8_acq:
1300   case clang::ARM::BI_InterlockedCompareExchange16_acq:
1301   case clang::ARM::BI_InterlockedCompareExchange_acq:
1302   case clang::ARM::BI_InterlockedCompareExchange64_acq:
1303     return MSVCIntrin::_InterlockedCompareExchange_acq;
1304   case clang::ARM::BI_InterlockedCompareExchange8_rel:
1305   case clang::ARM::BI_InterlockedCompareExchange16_rel:
1306   case clang::ARM::BI_InterlockedCompareExchange_rel:
1307   case clang::ARM::BI_InterlockedCompareExchange64_rel:
1308     return MSVCIntrin::_InterlockedCompareExchange_rel;
1309   case clang::ARM::BI_InterlockedCompareExchange8_nf:
1310   case clang::ARM::BI_InterlockedCompareExchange16_nf:
1311   case clang::ARM::BI_InterlockedCompareExchange_nf:
1312   case clang::ARM::BI_InterlockedCompareExchange64_nf:
1313     return MSVCIntrin::_InterlockedCompareExchange_nf;
1314   case clang::ARM::BI_InterlockedOr8_acq:
1315   case clang::ARM::BI_InterlockedOr16_acq:
1316   case clang::ARM::BI_InterlockedOr_acq:
1317   case clang::ARM::BI_InterlockedOr64_acq:
1318     return MSVCIntrin::_InterlockedOr_acq;
1319   case clang::ARM::BI_InterlockedOr8_rel:
1320   case clang::ARM::BI_InterlockedOr16_rel:
1321   case clang::ARM::BI_InterlockedOr_rel:
1322   case clang::ARM::BI_InterlockedOr64_rel:
1323     return MSVCIntrin::_InterlockedOr_rel;
1324   case clang::ARM::BI_InterlockedOr8_nf:
1325   case clang::ARM::BI_InterlockedOr16_nf:
1326   case clang::ARM::BI_InterlockedOr_nf:
1327   case clang::ARM::BI_InterlockedOr64_nf:
1328     return MSVCIntrin::_InterlockedOr_nf;
1329   case clang::ARM::BI_InterlockedXor8_acq:
1330   case clang::ARM::BI_InterlockedXor16_acq:
1331   case clang::ARM::BI_InterlockedXor_acq:
1332   case clang::ARM::BI_InterlockedXor64_acq:
1333     return MSVCIntrin::_InterlockedXor_acq;
1334   case clang::ARM::BI_InterlockedXor8_rel:
1335   case clang::ARM::BI_InterlockedXor16_rel:
1336   case clang::ARM::BI_InterlockedXor_rel:
1337   case clang::ARM::BI_InterlockedXor64_rel:
1338     return MSVCIntrin::_InterlockedXor_rel;
1339   case clang::ARM::BI_InterlockedXor8_nf:
1340   case clang::ARM::BI_InterlockedXor16_nf:
1341   case clang::ARM::BI_InterlockedXor_nf:
1342   case clang::ARM::BI_InterlockedXor64_nf:
1343     return MSVCIntrin::_InterlockedXor_nf;
1344   case clang::ARM::BI_InterlockedAnd8_acq:
1345   case clang::ARM::BI_InterlockedAnd16_acq:
1346   case clang::ARM::BI_InterlockedAnd_acq:
1347   case clang::ARM::BI_InterlockedAnd64_acq:
1348     return MSVCIntrin::_InterlockedAnd_acq;
1349   case clang::ARM::BI_InterlockedAnd8_rel:
1350   case clang::ARM::BI_InterlockedAnd16_rel:
1351   case clang::ARM::BI_InterlockedAnd_rel:
1352   case clang::ARM::BI_InterlockedAnd64_rel:
1353     return MSVCIntrin::_InterlockedAnd_rel;
1354   case clang::ARM::BI_InterlockedAnd8_nf:
1355   case clang::ARM::BI_InterlockedAnd16_nf:
1356   case clang::ARM::BI_InterlockedAnd_nf:
1357   case clang::ARM::BI_InterlockedAnd64_nf:
1358     return MSVCIntrin::_InterlockedAnd_nf;
1359   case clang::ARM::BI_InterlockedIncrement16_acq:
1360   case clang::ARM::BI_InterlockedIncrement_acq:
1361   case clang::ARM::BI_InterlockedIncrement64_acq:
1362     return MSVCIntrin::_InterlockedIncrement_acq;
1363   case clang::ARM::BI_InterlockedIncrement16_rel:
1364   case clang::ARM::BI_InterlockedIncrement_rel:
1365   case clang::ARM::BI_InterlockedIncrement64_rel:
1366     return MSVCIntrin::_InterlockedIncrement_rel;
1367   case clang::ARM::BI_InterlockedIncrement16_nf:
1368   case clang::ARM::BI_InterlockedIncrement_nf:
1369   case clang::ARM::BI_InterlockedIncrement64_nf:
1370     return MSVCIntrin::_InterlockedIncrement_nf;
1371   case clang::ARM::BI_InterlockedDecrement16_acq:
1372   case clang::ARM::BI_InterlockedDecrement_acq:
1373   case clang::ARM::BI_InterlockedDecrement64_acq:
1374     return MSVCIntrin::_InterlockedDecrement_acq;
1375   case clang::ARM::BI_InterlockedDecrement16_rel:
1376   case clang::ARM::BI_InterlockedDecrement_rel:
1377   case clang::ARM::BI_InterlockedDecrement64_rel:
1378     return MSVCIntrin::_InterlockedDecrement_rel;
1379   case clang::ARM::BI_InterlockedDecrement16_nf:
1380   case clang::ARM::BI_InterlockedDecrement_nf:
1381   case clang::ARM::BI_InterlockedDecrement64_nf:
1382     return MSVCIntrin::_InterlockedDecrement_nf;
1383   }
1384   llvm_unreachable("must return from switch");
1385 }
1386 
1387 static std::optional<CodeGenFunction::MSVCIntrin>
1388 translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
1389   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1390   switch (BuiltinID) {
1391   default:
1392     return std::nullopt;
1393   case clang::AArch64::BI_BitScanForward:
1394   case clang::AArch64::BI_BitScanForward64:
1395     return MSVCIntrin::_BitScanForward;
1396   case clang::AArch64::BI_BitScanReverse:
1397   case clang::AArch64::BI_BitScanReverse64:
1398     return MSVCIntrin::_BitScanReverse;
1399   case clang::AArch64::BI_InterlockedAnd64:
1400     return MSVCIntrin::_InterlockedAnd;
1401   case clang::AArch64::BI_InterlockedExchange64:
1402     return MSVCIntrin::_InterlockedExchange;
1403   case clang::AArch64::BI_InterlockedExchangeAdd64:
1404     return MSVCIntrin::_InterlockedExchangeAdd;
1405   case clang::AArch64::BI_InterlockedExchangeSub64:
1406     return MSVCIntrin::_InterlockedExchangeSub;
1407   case clang::AArch64::BI_InterlockedOr64:
1408     return MSVCIntrin::_InterlockedOr;
1409   case clang::AArch64::BI_InterlockedXor64:
1410     return MSVCIntrin::_InterlockedXor;
1411   case clang::AArch64::BI_InterlockedDecrement64:
1412     return MSVCIntrin::_InterlockedDecrement;
1413   case clang::AArch64::BI_InterlockedIncrement64:
1414     return MSVCIntrin::_InterlockedIncrement;
1415   case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
1416   case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
1417   case clang::AArch64::BI_InterlockedExchangeAdd_acq:
1418   case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
1419     return MSVCIntrin::_InterlockedExchangeAdd_acq;
1420   case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
1421   case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
1422   case clang::AArch64::BI_InterlockedExchangeAdd_rel:
1423   case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
1424     return MSVCIntrin::_InterlockedExchangeAdd_rel;
1425   case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
1426   case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
1427   case clang::AArch64::BI_InterlockedExchangeAdd_nf:
1428   case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
1429     return MSVCIntrin::_InterlockedExchangeAdd_nf;
1430   case clang::AArch64::BI_InterlockedExchange8_acq:
1431   case clang::AArch64::BI_InterlockedExchange16_acq:
1432   case clang::AArch64::BI_InterlockedExchange_acq:
1433   case clang::AArch64::BI_InterlockedExchange64_acq:
1434     return MSVCIntrin::_InterlockedExchange_acq;
1435   case clang::AArch64::BI_InterlockedExchange8_rel:
1436   case clang::AArch64::BI_InterlockedExchange16_rel:
1437   case clang::AArch64::BI_InterlockedExchange_rel:
1438   case clang::AArch64::BI_InterlockedExchange64_rel:
1439     return MSVCIntrin::_InterlockedExchange_rel;
1440   case clang::AArch64::BI_InterlockedExchange8_nf:
1441   case clang::AArch64::BI_InterlockedExchange16_nf:
1442   case clang::AArch64::BI_InterlockedExchange_nf:
1443   case clang::AArch64::BI_InterlockedExchange64_nf:
1444     return MSVCIntrin::_InterlockedExchange_nf;
1445   case clang::AArch64::BI_InterlockedCompareExchange8_acq:
1446   case clang::AArch64::BI_InterlockedCompareExchange16_acq:
1447   case clang::AArch64::BI_InterlockedCompareExchange_acq:
1448   case clang::AArch64::BI_InterlockedCompareExchange64_acq:
1449     return MSVCIntrin::_InterlockedCompareExchange_acq;
1450   case clang::AArch64::BI_InterlockedCompareExchange8_rel:
1451   case clang::AArch64::BI_InterlockedCompareExchange16_rel:
1452   case clang::AArch64::BI_InterlockedCompareExchange_rel:
1453   case clang::AArch64::BI_InterlockedCompareExchange64_rel:
1454     return MSVCIntrin::_InterlockedCompareExchange_rel;
1455   case clang::AArch64::BI_InterlockedCompareExchange8_nf:
1456   case clang::AArch64::BI_InterlockedCompareExchange16_nf:
1457   case clang::AArch64::BI_InterlockedCompareExchange_nf:
1458   case clang::AArch64::BI_InterlockedCompareExchange64_nf:
1459     return MSVCIntrin::_InterlockedCompareExchange_nf;
1460   case clang::AArch64::BI_InterlockedCompareExchange128:
1461     return MSVCIntrin::_InterlockedCompareExchange128;
1462   case clang::AArch64::BI_InterlockedCompareExchange128_acq:
1463     return MSVCIntrin::_InterlockedCompareExchange128_acq;
1464   case clang::AArch64::BI_InterlockedCompareExchange128_nf:
1465     return MSVCIntrin::_InterlockedCompareExchange128_nf;
1466   case clang::AArch64::BI_InterlockedCompareExchange128_rel:
1467     return MSVCIntrin::_InterlockedCompareExchange128_rel;
1468   case clang::AArch64::BI_InterlockedOr8_acq:
1469   case clang::AArch64::BI_InterlockedOr16_acq:
1470   case clang::AArch64::BI_InterlockedOr_acq:
1471   case clang::AArch64::BI_InterlockedOr64_acq:
1472     return MSVCIntrin::_InterlockedOr_acq;
1473   case clang::AArch64::BI_InterlockedOr8_rel:
1474   case clang::AArch64::BI_InterlockedOr16_rel:
1475   case clang::AArch64::BI_InterlockedOr_rel:
1476   case clang::AArch64::BI_InterlockedOr64_rel:
1477     return MSVCIntrin::_InterlockedOr_rel;
1478   case clang::AArch64::BI_InterlockedOr8_nf:
1479   case clang::AArch64::BI_InterlockedOr16_nf:
1480   case clang::AArch64::BI_InterlockedOr_nf:
1481   case clang::AArch64::BI_InterlockedOr64_nf:
1482     return MSVCIntrin::_InterlockedOr_nf;
1483   case clang::AArch64::BI_InterlockedXor8_acq:
1484   case clang::AArch64::BI_InterlockedXor16_acq:
1485   case clang::AArch64::BI_InterlockedXor_acq:
1486   case clang::AArch64::BI_InterlockedXor64_acq:
1487     return MSVCIntrin::_InterlockedXor_acq;
1488   case clang::AArch64::BI_InterlockedXor8_rel:
1489   case clang::AArch64::BI_InterlockedXor16_rel:
1490   case clang::AArch64::BI_InterlockedXor_rel:
1491   case clang::AArch64::BI_InterlockedXor64_rel:
1492     return MSVCIntrin::_InterlockedXor_rel;
1493   case clang::AArch64::BI_InterlockedXor8_nf:
1494   case clang::AArch64::BI_InterlockedXor16_nf:
1495   case clang::AArch64::BI_InterlockedXor_nf:
1496   case clang::AArch64::BI_InterlockedXor64_nf:
1497     return MSVCIntrin::_InterlockedXor_nf;
1498   case clang::AArch64::BI_InterlockedAnd8_acq:
1499   case clang::AArch64::BI_InterlockedAnd16_acq:
1500   case clang::AArch64::BI_InterlockedAnd_acq:
1501   case clang::AArch64::BI_InterlockedAnd64_acq:
1502     return MSVCIntrin::_InterlockedAnd_acq;
1503   case clang::AArch64::BI_InterlockedAnd8_rel:
1504   case clang::AArch64::BI_InterlockedAnd16_rel:
1505   case clang::AArch64::BI_InterlockedAnd_rel:
1506   case clang::AArch64::BI_InterlockedAnd64_rel:
1507     return MSVCIntrin::_InterlockedAnd_rel;
1508   case clang::AArch64::BI_InterlockedAnd8_nf:
1509   case clang::AArch64::BI_InterlockedAnd16_nf:
1510   case clang::AArch64::BI_InterlockedAnd_nf:
1511   case clang::AArch64::BI_InterlockedAnd64_nf:
1512     return MSVCIntrin::_InterlockedAnd_nf;
1513   case clang::AArch64::BI_InterlockedIncrement16_acq:
1514   case clang::AArch64::BI_InterlockedIncrement_acq:
1515   case clang::AArch64::BI_InterlockedIncrement64_acq:
1516     return MSVCIntrin::_InterlockedIncrement_acq;
1517   case clang::AArch64::BI_InterlockedIncrement16_rel:
1518   case clang::AArch64::BI_InterlockedIncrement_rel:
1519   case clang::AArch64::BI_InterlockedIncrement64_rel:
1520     return MSVCIntrin::_InterlockedIncrement_rel;
1521   case clang::AArch64::BI_InterlockedIncrement16_nf:
1522   case clang::AArch64::BI_InterlockedIncrement_nf:
1523   case clang::AArch64::BI_InterlockedIncrement64_nf:
1524     return MSVCIntrin::_InterlockedIncrement_nf;
1525   case clang::AArch64::BI_InterlockedDecrement16_acq:
1526   case clang::AArch64::BI_InterlockedDecrement_acq:
1527   case clang::AArch64::BI_InterlockedDecrement64_acq:
1528     return MSVCIntrin::_InterlockedDecrement_acq;
1529   case clang::AArch64::BI_InterlockedDecrement16_rel:
1530   case clang::AArch64::BI_InterlockedDecrement_rel:
1531   case clang::AArch64::BI_InterlockedDecrement64_rel:
1532     return MSVCIntrin::_InterlockedDecrement_rel;
1533   case clang::AArch64::BI_InterlockedDecrement16_nf:
1534   case clang::AArch64::BI_InterlockedDecrement_nf:
1535   case clang::AArch64::BI_InterlockedDecrement64_nf:
1536     return MSVCIntrin::_InterlockedDecrement_nf;
1537   }
1538   llvm_unreachable("must return from switch");
1539 }
1540 
1541 static std::optional<CodeGenFunction::MSVCIntrin>
1542 translateX86ToMsvcIntrin(unsigned BuiltinID) {
1543   using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1544   switch (BuiltinID) {
1545   default:
1546     return std::nullopt;
1547   case clang::X86::BI_BitScanForward:
1548   case clang::X86::BI_BitScanForward64:
1549     return MSVCIntrin::_BitScanForward;
1550   case clang::X86::BI_BitScanReverse:
1551   case clang::X86::BI_BitScanReverse64:
1552     return MSVCIntrin::_BitScanReverse;
1553   case clang::X86::BI_InterlockedAnd64:
1554     return MSVCIntrin::_InterlockedAnd;
1555   case clang::X86::BI_InterlockedCompareExchange128:
1556     return MSVCIntrin::_InterlockedCompareExchange128;
1557   case clang::X86::BI_InterlockedExchange64:
1558     return MSVCIntrin::_InterlockedExchange;
1559   case clang::X86::BI_InterlockedExchangeAdd64:
1560     return MSVCIntrin::_InterlockedExchangeAdd;
1561   case clang::X86::BI_InterlockedExchangeSub64:
1562     return MSVCIntrin::_InterlockedExchangeSub;
1563   case clang::X86::BI_InterlockedOr64:
1564     return MSVCIntrin::_InterlockedOr;
1565   case clang::X86::BI_InterlockedXor64:
1566     return MSVCIntrin::_InterlockedXor;
1567   case clang::X86::BI_InterlockedDecrement64:
1568     return MSVCIntrin::_InterlockedDecrement;
1569   case clang::X86::BI_InterlockedIncrement64:
1570     return MSVCIntrin::_InterlockedIncrement;
1571   }
1572   llvm_unreachable("must return from switch");
1573 }
1574 
1575 // Emit an MSVC intrinsic. Assumes that arguments have *not* been evaluated.
1576 Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
1577                                             const CallExpr *E) {
1578   switch (BuiltinID) {
1579   case MSVCIntrin::_BitScanForward:
1580   case MSVCIntrin::_BitScanReverse: {
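    // _BitScanForward/_BitScanReverse return 0 when the input is zero and 1
    // otherwise, storing the index of the lowest/highest set bit through the
    // first argument; emit a small diamond with a PHI to produce that result.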
1581     Address IndexAddress(EmitPointerWithAlignment(E->getArg(0)));
1582     Value *ArgValue = EmitScalarExpr(E->getArg(1));
1583 
1584     llvm::Type *ArgType = ArgValue->getType();
1585     llvm::Type *IndexType = IndexAddress.getElementType();
1586     llvm::Type *ResultType = ConvertType(E->getType());
1587 
1588     Value *ArgZero = llvm::Constant::getNullValue(ArgType);
1589     Value *ResZero = llvm::Constant::getNullValue(ResultType);
1590     Value *ResOne = llvm::ConstantInt::get(ResultType, 1);
1591 
1592     BasicBlock *Begin = Builder.GetInsertBlock();
1593     BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
1594     Builder.SetInsertPoint(End);
1595     PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");
1596 
1597     Builder.SetInsertPoint(Begin);
1598     Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
1599     BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
1600     Builder.CreateCondBr(IsZero, End, NotZero);
1601     Result->addIncoming(ResZero, Begin);
1602 
1603     Builder.SetInsertPoint(NotZero);
1604 
1605     if (BuiltinID == MSVCIntrin::_BitScanForward) {
1606       Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1607       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1608       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1609       Builder.CreateStore(ZeroCount, IndexAddress, false);
1610     } else {
1611       unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
1612       Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);
1613 
1614       Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1615       Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1616       ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1617       Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
1618       Builder.CreateStore(Index, IndexAddress, false);
1619     }
1620     Builder.CreateBr(End);
1621     Result->addIncoming(ResOne, NotZero);
1622 
1623     Builder.SetInsertPoint(End);
1624     return Result;
1625   }
1626   case MSVCIntrin::_InterlockedAnd:
1627     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
1628   case MSVCIntrin::_InterlockedExchange:
1629     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
1630   case MSVCIntrin::_InterlockedExchangeAdd:
1631     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
1632   case MSVCIntrin::_InterlockedExchangeSub:
1633     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
1634   case MSVCIntrin::_InterlockedOr:
1635     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
1636   case MSVCIntrin::_InterlockedXor:
1637     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
1638   case MSVCIntrin::_InterlockedExchangeAdd_acq:
1639     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1640                                  AtomicOrdering::Acquire);
1641   case MSVCIntrin::_InterlockedExchangeAdd_rel:
1642     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1643                                  AtomicOrdering::Release);
1644   case MSVCIntrin::_InterlockedExchangeAdd_nf:
1645     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1646                                  AtomicOrdering::Monotonic);
1647   case MSVCIntrin::_InterlockedExchange_acq:
1648     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1649                                  AtomicOrdering::Acquire);
1650   case MSVCIntrin::_InterlockedExchange_rel:
1651     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1652                                  AtomicOrdering::Release);
1653   case MSVCIntrin::_InterlockedExchange_nf:
1654     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1655                                  AtomicOrdering::Monotonic);
1656   case MSVCIntrin::_InterlockedCompareExchange_acq:
1657     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Acquire);
1658   case MSVCIntrin::_InterlockedCompareExchange_rel:
1659     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
1660   case MSVCIntrin::_InterlockedCompareExchange_nf:
1661     return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1662   case MSVCIntrin::_InterlockedCompareExchange128:
1663     return EmitAtomicCmpXchg128ForMSIntrin(
1664         *this, E, AtomicOrdering::SequentiallyConsistent);
1665   case MSVCIntrin::_InterlockedCompareExchange128_acq:
1666     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
1667   case MSVCIntrin::_InterlockedCompareExchange128_rel:
1668     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
1669   case MSVCIntrin::_InterlockedCompareExchange128_nf:
1670     return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1671   case MSVCIntrin::_InterlockedOr_acq:
1672     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1673                                  AtomicOrdering::Acquire);
1674   case MSVCIntrin::_InterlockedOr_rel:
1675     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1676                                  AtomicOrdering::Release);
1677   case MSVCIntrin::_InterlockedOr_nf:
1678     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1679                                  AtomicOrdering::Monotonic);
1680   case MSVCIntrin::_InterlockedXor_acq:
1681     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1682                                  AtomicOrdering::Acquire);
1683   case MSVCIntrin::_InterlockedXor_rel:
1684     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1685                                  AtomicOrdering::Release);
1686   case MSVCIntrin::_InterlockedXor_nf:
1687     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1688                                  AtomicOrdering::Monotonic);
1689   case MSVCIntrin::_InterlockedAnd_acq:
1690     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1691                                  AtomicOrdering::Acquire);
1692   case MSVCIntrin::_InterlockedAnd_rel:
1693     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1694                                  AtomicOrdering::Release);
1695   case MSVCIntrin::_InterlockedAnd_nf:
1696     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1697                                  AtomicOrdering::Monotonic);
1698   case MSVCIntrin::_InterlockedIncrement_acq:
1699     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Acquire);
1700   case MSVCIntrin::_InterlockedIncrement_rel:
1701     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Release);
1702   case MSVCIntrin::_InterlockedIncrement_nf:
1703     return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Monotonic);
1704   case MSVCIntrin::_InterlockedDecrement_acq:
1705     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Acquire);
1706   case MSVCIntrin::_InterlockedDecrement_rel:
1707     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Release);
1708   case MSVCIntrin::_InterlockedDecrement_nf:
1709     return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Monotonic);
1710 
1711   case MSVCIntrin::_InterlockedDecrement:
1712     return EmitAtomicDecrementValue(*this, E);
1713   case MSVCIntrin::_InterlockedIncrement:
1714     return EmitAtomicIncrementValue(*this, E);
1715 
1716   case MSVCIntrin::__fastfail: {
1717     // Request immediate process termination from the kernel. The instruction
1718     // sequences to do this are documented on MSDN:
1719     // https://msdn.microsoft.com/en-us/library/dn774154.aspx
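    // The fail code argument is passed in the register named by the
    // constraint string below, and the call is marked noreturn.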
1720     llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
1721     StringRef Asm, Constraints;
1722     switch (ISA) {
1723     default:
1724       ErrorUnsupported(E, "__fastfail call for this architecture");
1725       break;
1726     case llvm::Triple::x86:
1727     case llvm::Triple::x86_64:
1728       Asm = "int $$0x29";
1729       Constraints = "{cx}";
1730       break;
1731     case llvm::Triple::thumb:
1732       Asm = "udf #251";
1733       Constraints = "{r0}";
1734       break;
1735     case llvm::Triple::aarch64:
1736       Asm = "brk #0xF003";
1737       Constraints = "{w0}";
1738     }
1739     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
1740     llvm::InlineAsm *IA =
1741         llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1742     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
1743         getLLVMContext(), llvm::AttributeList::FunctionIndex,
1744         llvm::Attribute::NoReturn);
1745     llvm::CallInst *CI = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
1746     CI->setAttributes(NoReturnAttr);
1747     return CI;
1748   }
1749   }
1750   llvm_unreachable("Incorrect MSVC intrinsic!");
1751 }
1752 
1753 namespace {
1754 // ARC cleanup for __builtin_os_log_format
1755 struct CallObjCArcUse final : EHScopeStack::Cleanup {
1756   CallObjCArcUse(llvm::Value *object) : object(object) {}
1757   llvm::Value *object;
1758 
1759   void Emit(CodeGenFunction &CGF, Flags flags) override {
1760     CGF.EmitARCIntrinsicUse(object);
1761   }
1762 };
1763 }
1764 
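/// Emit the argument of a clz/ctz-style builtin and, when -fsanitize=builtin
/// is enabled, additionally check that the value is non-zero, reporting a zero
/// argument through the invalid-builtin sanitizer handler.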
1765 Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
1766                                                  BuiltinCheckKind Kind) {
1767   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero) &&
1768          "Unsupported builtin check kind");
1769 
1770   Value *ArgValue = EmitScalarExpr(E);
1771   if (!SanOpts.has(SanitizerKind::Builtin))
1772     return ArgValue;
1773 
1774   SanitizerScope SanScope(this);
1775   Value *Cond = Builder.CreateICmpNE(
1776       ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
1777   EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
1778             SanitizerHandler::InvalidBuiltin,
1779             {EmitCheckSourceLocation(E->getExprLoc()),
1780              llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
1781             std::nullopt);
1782   return ArgValue;
1783 }
1784 
1785 /// Get the argument type for arguments to os_log_helper.
1786 static CanQualType getOSLogArgType(ASTContext &C, int Size) {
1787   QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
1788   return C.getCanonicalType(UnsignedTy);
1789 }
1790 
1791 llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
1792     const analyze_os_log::OSLogBufferLayout &Layout,
1793     CharUnits BufferAlignment) {
1794   ASTContext &Ctx = getContext();
1795 
1796   llvm::SmallString<64> Name;
1797   {
1798     raw_svector_ostream OS(Name);
1799     OS << "__os_log_helper";
1800     OS << "_" << BufferAlignment.getQuantity();
1801     OS << "_" << int(Layout.getSummaryByte());
1802     OS << "_" << int(Layout.getNumArgsByte());
1803     for (const auto &Item : Layout.Items)
1804       OS << "_" << int(Item.getSizeByte()) << "_"
1805          << int(Item.getDescriptorByte());
1806   }
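  // The mangled name encodes the buffer alignment, the summary and arg-count
  // bytes, and each item's size/descriptor bytes, e.g. (illustrative)
  // "__os_log_helper_8_0_2_4_0_8_0".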
1807 
1808   if (llvm::Function *F = CGM.getModule().getFunction(Name))
1809     return F;
1810 
1811   llvm::SmallVector<QualType, 4> ArgTys;
1812   FunctionArgList Args;
1813   Args.push_back(ImplicitParamDecl::Create(
1814       Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"), Ctx.VoidPtrTy,
1815       ImplicitParamDecl::Other));
1816   ArgTys.emplace_back(Ctx.VoidPtrTy);
1817 
1818   for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
1819     char Size = Layout.Items[I].getSizeByte();
1820     if (!Size)
1821       continue;
1822 
1823     QualType ArgTy = getOSLogArgType(Ctx, Size);
1824     Args.push_back(ImplicitParamDecl::Create(
1825         Ctx, nullptr, SourceLocation(),
1826         &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)), ArgTy,
1827         ImplicitParamDecl::Other));
1828     ArgTys.emplace_back(ArgTy);
1829   }
1830 
1831   QualType ReturnTy = Ctx.VoidTy;
1832 
1833   // The helper function has linkonce_odr linkage to enable the linker to merge
1834   // identical functions. To ensure the merging always happens, 'noinline' is
1835   // attached to the function when compiling with -Oz.
1836   const CGFunctionInfo &FI =
1837       CGM.getTypes().arrangeBuiltinFunctionDeclaration(ReturnTy, Args);
1838   llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
1839   llvm::Function *Fn = llvm::Function::Create(
1840       FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
1841   Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
1842   CGM.SetLLVMFunctionAttributes(GlobalDecl(), FI, Fn, /*IsThunk=*/false);
1843   CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);
1844   Fn->setDoesNotThrow();
1845 
1846   // Attach 'noinline' at -Oz.
1847   if (CGM.getCodeGenOpts().OptimizeSize == 2)
1848     Fn->addFnAttr(llvm::Attribute::NoInline);
1849 
1850   auto NL = ApplyDebugLocation::CreateEmpty(*this);
1851   StartFunction(GlobalDecl(), ReturnTy, Fn, FI, Args);
1852 
1853   // Create a scope with an artificial location for the body of this function.
1854   auto AL = ApplyDebugLocation::CreateArtificial(*this);
1855 
1856   CharUnits Offset;
1857   Address BufAddr =
1858       Address(Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"), Int8Ty,
1859               BufferAlignment);
1860   Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
1861                       Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
1862   Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
1863                       Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));
1864 
1865   unsigned I = 1;
1866   for (const auto &Item : Layout.Items) {
1867     Builder.CreateStore(
1868         Builder.getInt8(Item.getDescriptorByte()),
1869         Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
1870     Builder.CreateStore(
1871         Builder.getInt8(Item.getSizeByte()),
1872         Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));
1873 
1874     CharUnits Size = Item.size();
1875     if (!Size.getQuantity())
1876       continue;
1877 
1878     Address Arg = GetAddrOfLocalVar(Args[I]);
1879     Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
1880     Addr = Addr.withElementType(Arg.getElementType());
1881     Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
1882     Offset += Size;
1883     ++I;
1884   }
1885 
1886   FinishFunction();
1887 
1888   return Fn;
1889 }
1890 
1891 RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
1892   assert(E.getNumArgs() >= 2 &&
1893          "__builtin_os_log_format takes at least 2 arguments");
1894   ASTContext &Ctx = getContext();
1895   analyze_os_log::OSLogBufferLayout Layout;
1896   analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
1897   Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
1898   llvm::SmallVector<llvm::Value *, 4> RetainableOperands;
1899 
1900   // Ignore argument 1, the format string. It is not currently used.
1901   CallArgList Args;
1902   Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);
1903 
1904   for (const auto &Item : Layout.Items) {
1905     int Size = Item.getSizeByte();
1906     if (!Size)
1907       continue;
1908 
1909     llvm::Value *ArgVal;
1910 
1911     if (Item.getKind() == analyze_os_log::OSLogBufferItem::MaskKind) {
1912       uint64_t Val = 0;
1913       for (unsigned I = 0, E = Item.getMaskType().size(); I < E; ++I)
1914         Val |= ((uint64_t)Item.getMaskType()[I]) << I * 8;
1915       ArgVal = llvm::Constant::getIntegerValue(Int64Ty, llvm::APInt(64, Val));
1916     } else if (const Expr *TheExpr = Item.getExpr()) {
1917       ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);
1918 
1919       // If a temporary object that requires destruction after the full
1920       // expression is passed, push a lifetime-extended cleanup to extend its
1921       // lifetime to the end of the enclosing block scope.
1922       auto LifetimeExtendObject = [&](const Expr *E) {
1923         E = E->IgnoreParenCasts();
1924         // Extend lifetimes of objects returned by function calls and message
1925         // sends.
1926 
1927         // FIXME: We should do this in other cases in which temporaries are
1928         //        created including arguments of non-ARC types (e.g., C++
1929         //        temporaries).
1930         if (isa<CallExpr>(E) || isa<ObjCMessageExpr>(E))
1931           return true;
1932         return false;
1933       };
1934 
1935       if (TheExpr->getType()->isObjCRetainableType() &&
1936           getLangOpts().ObjCAutoRefCount && LifetimeExtendObject(TheExpr)) {
1937         assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
1938                "Only a scalar can be an ObjC retainable type");
1939         if (!isa<Constant>(ArgVal)) {
1940           CleanupKind Cleanup = getARCCleanupKind();
1941           QualType Ty = TheExpr->getType();
1942           Address Alloca = Address::invalid();
1943           Address Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca);
1944           ArgVal = EmitARCRetain(Ty, ArgVal);
1945           Builder.CreateStore(ArgVal, Addr);
1946           pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty,
1947                                       CodeGenFunction::destroyARCStrongPrecise,
1948                                       Cleanup & EHCleanup);
1949 
1950           // Push a clang.arc.use call to ensure the ARC optimizer knows
1951           // that the argument must be kept alive.
1952           if (CGM.getCodeGenOpts().OptimizationLevel != 0)
1953             pushCleanupAfterFullExpr<CallObjCArcUse>(Cleanup, ArgVal);
1954         }
1955       }
1956     } else {
1957       ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
1958     }
1959 
1960     unsigned ArgValSize =
1961         CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
1962     llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
1963                                                      ArgValSize);
1964     ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
1965     CanQualType ArgTy = getOSLogArgType(Ctx, Size);
1966     // If ArgVal has type x86_fp80, zero-extend ArgVal.
1967     ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
1968     Args.add(RValue::get(ArgVal), ArgTy);
1969   }
1970 
1971   const CGFunctionInfo &FI =
1972       CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
1973   llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
1974       Layout, BufAddr.getAlignment());
1975   EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);
1976   return RValue::get(BufAddr.getPointer());
1977 }
1978 
1979 static bool isSpecialUnsignedMultiplySignedResult(
1980     unsigned BuiltinID, WidthAndSignedness Op1Info, WidthAndSignedness Op2Info,
1981     WidthAndSignedness ResultInfo) {
1982   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
1983          Op1Info.Width == Op2Info.Width && Op2Info.Width == ResultInfo.Width &&
1984          !Op1Info.Signed && !Op2Info.Signed && ResultInfo.Signed;
1985 }
1986 
1987 static RValue EmitCheckedUnsignedMultiplySignedResult(
1988     CodeGenFunction &CGF, const clang::Expr *Op1, WidthAndSignedness Op1Info,
1989     const clang::Expr *Op2, WidthAndSignedness Op2Info,
1990     const clang::Expr *ResultArg, QualType ResultQTy,
1991     WidthAndSignedness ResultInfo) {
1992   assert(isSpecialUnsignedMultiplySignedResult(
1993              Builtin::BI__builtin_mul_overflow, Op1Info, Op2Info, ResultInfo) &&
1994          "Cannot specialize this multiply");
1995 
1996   llvm::Value *V1 = CGF.EmitScalarExpr(Op1);
1997   llvm::Value *V2 = CGF.EmitScalarExpr(Op2);
1998 
1999   llvm::Value *HasOverflow;
2000   llvm::Value *Result = EmitOverflowIntrinsic(
2001       CGF, llvm::Intrinsic::umul_with_overflow, V1, V2, HasOverflow);
2002 
2003   // The intrinsic call will detect overflow when the value is > UINT_MAX;
2004   // however, since the original builtin had a signed result, we need to report
2005   // an overflow when the result is greater than INT_MAX.
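  // For example, with 32-bit operands, 0x80000000u * 1u does not overflow as
  // an unsigned multiply, but the product exceeds INT_MAX and so must still be
  // reported as an overflow here.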
2006   auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width);
2007   llvm::Value *IntMaxValue = llvm::ConstantInt::get(Result->getType(), IntMax);
2008 
2009   llvm::Value *IntMaxOverflow = CGF.Builder.CreateICmpUGT(Result, IntMaxValue);
2010   HasOverflow = CGF.Builder.CreateOr(HasOverflow, IntMaxOverflow);
2011 
2012   bool isVolatile =
2013       ResultArg->getType()->getPointeeType().isVolatileQualified();
2014   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2015   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2016                           isVolatile);
2017   return RValue::get(HasOverflow);
2018 }
2019 
2020 /// Determine if a binop is a checked mixed-sign multiply we can specialize.
2021 static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
2022                                        WidthAndSignedness Op1Info,
2023                                        WidthAndSignedness Op2Info,
2024                                        WidthAndSignedness ResultInfo) {
2025   return BuiltinID == Builtin::BI__builtin_mul_overflow &&
2026          std::max(Op1Info.Width, Op2Info.Width) >= ResultInfo.Width &&
2027          Op1Info.Signed != Op2Info.Signed;
2028 }
2029 
2030 /// Emit a checked mixed-sign multiply. This is a cheaper specialization of
2031 /// the generic checked-binop irgen.
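/// For example (illustrative), a __builtin_mul_overflow call with an int and
/// an unsigned operand and an int result has same-width operands of differing
/// signedness and takes this path.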
2032 static RValue
2033 EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
2034                              WidthAndSignedness Op1Info, const clang::Expr *Op2,
2035                              WidthAndSignedness Op2Info,
2036                              const clang::Expr *ResultArg, QualType ResultQTy,
2037                              WidthAndSignedness ResultInfo) {
2038   assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
2039                                     Op2Info, ResultInfo) &&
2040          "Not a mixed-sign multiplication we can specialize");
2041 
2042   // Emit the signed and unsigned operands.
2043   const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
2044   const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
2045   llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
2046   llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);
2047   unsigned SignedOpWidth = Op1Info.Signed ? Op1Info.Width : Op2Info.Width;
2048   unsigned UnsignedOpWidth = Op1Info.Signed ? Op2Info.Width : Op1Info.Width;
2049 
2050   // One of the operands may be smaller than the other. If so, [s|z]ext it.
2051   if (SignedOpWidth < UnsignedOpWidth)
2052     Signed = CGF.Builder.CreateSExt(Signed, Unsigned->getType(), "op.sext");
2053   if (UnsignedOpWidth < SignedOpWidth)
2054     Unsigned = CGF.Builder.CreateZExt(Unsigned, Signed->getType(), "op.zext");
2055 
2056   llvm::Type *OpTy = Signed->getType();
2057   llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
2058   Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2059   llvm::Type *ResTy = ResultPtr.getElementType();
2060   unsigned OpWidth = std::max(Op1Info.Width, Op2Info.Width);
2061 
2062   // Take the absolute value of the signed operand.
2063   llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
2064   llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
2065   llvm::Value *AbsSigned =
2066       CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);
2067 
2068   // Perform a checked unsigned multiplication.
2069   llvm::Value *UnsignedOverflow;
2070   llvm::Value *UnsignedResult =
2071       EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
2072                             Unsigned, UnsignedOverflow);
2073 
2074   llvm::Value *Overflow, *Result;
2075   if (ResultInfo.Signed) {
2076     // Signed overflow occurs if the result is greater than INT_MAX or less
2077     // than INT_MIN, i.e. when |Result| > (INT_MAX + IsNegative).
2078     auto IntMax =
2079         llvm::APInt::getSignedMaxValue(ResultInfo.Width).zext(OpWidth);
2080     llvm::Value *MaxResult =
2081         CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
2082                               CGF.Builder.CreateZExt(IsNegative, OpTy));
2083     llvm::Value *SignedOverflow =
2084         CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
2085     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);
2086 
2087     // Prepare the signed result (possibly by negating it).
2088     llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
2089     llvm::Value *SignedResult =
2090         CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
2091     Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
2092   } else {
2093     // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
2094     llvm::Value *Underflow = CGF.Builder.CreateAnd(
2095         IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
2096     Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
2097     if (ResultInfo.Width < OpWidth) {
2098       auto IntMax =
2099           llvm::APInt::getMaxValue(ResultInfo.Width).zext(OpWidth);
2100       llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
2101           UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
2102       Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
2103     }
2104 
2105     // Negate the product if it would be negative in infinite precision.
2106     Result = CGF.Builder.CreateSelect(
2107         IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);
2108 
2109     Result = CGF.Builder.CreateTrunc(Result, ResTy);
2110   }
2111   assert(Overflow && Result && "Missing overflow or result");
2112 
2113   bool isVolatile =
2114       ResultArg->getType()->getPointeeType().isVolatileQualified();
2115   CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2116                           isVolatile);
2117   return RValue::get(Overflow);
2118 }
2119 
2120 static bool
2121 TypeRequiresBuiltinLaunderImp(const ASTContext &Ctx, QualType Ty,
2122                               llvm::SmallPtrSetImpl<const Decl *> &Seen) {
2123   if (const auto *Arr = Ctx.getAsArrayType(Ty))
2124     Ty = Ctx.getBaseElementType(Arr);
2125 
2126   const auto *Record = Ty->getAsCXXRecordDecl();
2127   if (!Record)
2128     return false;
2129 
2130   // We've already checked this type, or are in the process of checking it.
2131   if (!Seen.insert(Record).second)
2132     return false;
2133 
2134   assert(Record->hasDefinition() &&
2135          "Incomplete types should already be diagnosed");
2136 
2137   if (Record->isDynamicClass())
2138     return true;
2139 
2140   for (FieldDecl *F : Record->fields()) {
2141     if (TypeRequiresBuiltinLaunderImp(Ctx, F->getType(), Seen))
2142       return true;
2143   }
2144   return false;
2145 }
2146 
2147 /// Determine if the specified type requires laundering by checking if it is a
2148 /// dynamic class type or contains a subobject which is a dynamic class type.
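/// For instance (illustrative), under -fstrict-vtable-pointers a struct with a
/// polymorphic member requires laundering, while a plain aggregate of scalars
/// does not.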
2149 static bool TypeRequiresBuiltinLaunder(CodeGenModule &CGM, QualType Ty) {
2150   if (!CGM.getCodeGenOpts().StrictVTablePointers)
2151     return false;
2152   llvm::SmallPtrSet<const Decl *, 16> Seen;
2153   return TypeRequiresBuiltinLaunderImp(CGM.getContext(), Ty, Seen);
2154 }
2155 
2156 RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) {
2157   llvm::Value *Src = EmitScalarExpr(E->getArg(0));
2158   llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1));
2159 
2160   // The builtin's shift arg may have a different type than the source arg and
2161   // result, but the LLVM intrinsic uses the same type for all values.
2162   llvm::Type *Ty = Src->getType();
2163   ShiftAmt = Builder.CreateIntCast(ShiftAmt, Ty, false);
2164 
2165   // Rotate is a special case of LLVM funnel shift - the first two args are the same.
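  // For example (illustrative), a 32-bit rotate-left by N is emitted as
  // llvm.fshl.i32(Src, Src, N).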
2166   unsigned IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl;
2167   Function *F = CGM.getIntrinsic(IID, Ty);
2168   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
2169 }
2170 
2171 // Map long-double math builtins to their f128 equivalents.
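// For example, __builtin_sqrtl is rewritten to __builtin_sqrtf128; the caller
// below only applies this when long double uses the IEEE 128-bit format.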
2172 static unsigned mutateLongDoubleBuiltin(unsigned BuiltinID) {
2173   switch (BuiltinID) {
2174 #define MUTATE_LDBL(func) \
2175   case Builtin::BI__builtin_##func##l: \
2176     return Builtin::BI__builtin_##func##f128;
2177   MUTATE_LDBL(sqrt)
2178   MUTATE_LDBL(cbrt)
2179   MUTATE_LDBL(fabs)
2180   MUTATE_LDBL(log)
2181   MUTATE_LDBL(log2)
2182   MUTATE_LDBL(log10)
2183   MUTATE_LDBL(log1p)
2184   MUTATE_LDBL(logb)
2185   MUTATE_LDBL(exp)
2186   MUTATE_LDBL(exp2)
2187   MUTATE_LDBL(expm1)
2188   MUTATE_LDBL(fdim)
2189   MUTATE_LDBL(hypot)
2190   MUTATE_LDBL(ilogb)
2191   MUTATE_LDBL(pow)
2192   MUTATE_LDBL(fmin)
2193   MUTATE_LDBL(fmax)
2194   MUTATE_LDBL(ceil)
2195   MUTATE_LDBL(trunc)
2196   MUTATE_LDBL(rint)
2197   MUTATE_LDBL(nearbyint)
2198   MUTATE_LDBL(round)
2199   MUTATE_LDBL(floor)
2200   MUTATE_LDBL(lround)
2201   MUTATE_LDBL(llround)
2202   MUTATE_LDBL(lrint)
2203   MUTATE_LDBL(llrint)
2204   MUTATE_LDBL(fmod)
2205   MUTATE_LDBL(modf)
2206   MUTATE_LDBL(nan)
2207   MUTATE_LDBL(nans)
2208   MUTATE_LDBL(inf)
2209   MUTATE_LDBL(fma)
2210   MUTATE_LDBL(sin)
2211   MUTATE_LDBL(cos)
2212   MUTATE_LDBL(tan)
2213   MUTATE_LDBL(sinh)
2214   MUTATE_LDBL(cosh)
2215   MUTATE_LDBL(tanh)
2216   MUTATE_LDBL(asin)
2217   MUTATE_LDBL(acos)
2218   MUTATE_LDBL(atan)
2219   MUTATE_LDBL(asinh)
2220   MUTATE_LDBL(acosh)
2221   MUTATE_LDBL(atanh)
2222   MUTATE_LDBL(atan2)
2223   MUTATE_LDBL(erf)
2224   MUTATE_LDBL(erfc)
2225   MUTATE_LDBL(ldexp)
2226   MUTATE_LDBL(frexp)
2227   MUTATE_LDBL(huge_val)
2228   MUTATE_LDBL(copysign)
2229   MUTATE_LDBL(nextafter)
2230   MUTATE_LDBL(nexttoward)
2231   MUTATE_LDBL(remainder)
2232   MUTATE_LDBL(remquo)
2233   MUTATE_LDBL(scalbln)
2234   MUTATE_LDBL(scalbn)
2235   MUTATE_LDBL(tgamma)
2236   MUTATE_LDBL(lgamma)
2237 #undef MUTATE_LDBL
2238   default:
2239     return BuiltinID;
2240   }
2241 }
2242 
2243 static Value *tryUseTestFPKind(CodeGenFunction &CGF, unsigned BuiltinID,
2244                                Value *V) {
2245   if (CGF.Builder.getIsFPConstrained() &&
2246       CGF.Builder.getDefaultConstrainedExcept() != fp::ebIgnore) {
2247     if (Value *Result =
2248             CGF.getTargetHooks().testFPKind(V, BuiltinID, CGF.Builder, CGF.CGM))
2249       return Result;
2250   }
2251   return nullptr;
2252 }
2253 
2254 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
2255                                         const CallExpr *E,
2256                                         ReturnValueSlot ReturnValue) {
2257   const FunctionDecl *FD = GD.getDecl()->getAsFunction();
2258   // See if we can constant fold this builtin.  If so, don't emit it at all.
2259   // TODO: Extend this handling to all builtin calls that we can constant-fold.
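  // For instance (illustrative), __builtin_popcount(0xF) evaluates to the
  // constant 4 here and no call is emitted.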
2260   Expr::EvalResult Result;
2261   if (E->isPRValue() && E->EvaluateAsRValue(Result, CGM.getContext()) &&
2262       !Result.hasSideEffects()) {
2263     if (Result.Val.isInt())
2264       return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
2265                                                 Result.Val.getInt()));
2266     if (Result.Val.isFloat())
2267       return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
2268                                                Result.Val.getFloat()));
2269   }
2270 
2271   // If the current long-double semantics are IEEE 128-bit, replace long-double
2272   // math builtins with their f128 equivalents.
2273   // TODO: This mutation should also be applied to targets other than PPC once
2274   // the backend supports IEEE 128-bit style libcalls.
2275   if (getTarget().getTriple().isPPC64() &&
2276       &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
2277     BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
2278 
2279   // If the builtin has been declared explicitly with an assembler label,
2280   // disable the specialized emitting below. Ideally we should communicate the
2281   // rename in IR, or at least avoid generating the intrinsic calls that are
2282   // likely to get lowered to the renamed library functions.
2283   const unsigned BuiltinIDIfNoAsmLabel =
2284       FD->hasAttr<AsmLabelAttr>() ? 0 : BuiltinID;
2285 
2286   // There are LLVM math intrinsics/instructions corresponding to math library
2287   // functions, except that the LLVM op never sets errno while the math library
2288   // might. Also, math builtins have the same semantics as their math library
2289   // twins. Thus, we can transform math library and builtin calls to their
2290   // LLVM counterparts if the call is marked 'const' (known to never set errno).
2291   // When FP exceptions are enabled, the experimental versions of the
2292   // intrinsics model them.
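  // For instance (illustrative), with -fno-math-errno a sqrt() call can be
  // emitted below as the llvm.sqrt intrinsic (or its constrained counterpart
  // when FP exceptions are modeled) instead of a libcall.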
2293   bool ConstWithoutErrnoAndExceptions =
2294       getContext().BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
2295   bool ConstWithoutExceptions =
2296       getContext().BuiltinInfo.isConstWithoutExceptions(BuiltinID);
2297   if (FD->hasAttr<ConstAttr>() ||
2298       ((ConstWithoutErrnoAndExceptions || ConstWithoutExceptions) &&
2299        (!ConstWithoutErrnoAndExceptions || (!getLangOpts().MathErrno)))) {
2300     switch (BuiltinIDIfNoAsmLabel) {
2301     case Builtin::BIceil:
2302     case Builtin::BIceilf:
2303     case Builtin::BIceill:
2304     case Builtin::BI__builtin_ceil:
2305     case Builtin::BI__builtin_ceilf:
2306     case Builtin::BI__builtin_ceilf16:
2307     case Builtin::BI__builtin_ceill:
2308     case Builtin::BI__builtin_ceilf128:
2309       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2310                                    Intrinsic::ceil,
2311                                    Intrinsic::experimental_constrained_ceil));
2312 
2313     case Builtin::BIcopysign:
2314     case Builtin::BIcopysignf:
2315     case Builtin::BIcopysignl:
2316     case Builtin::BI__builtin_copysign:
2317     case Builtin::BI__builtin_copysignf:
2318     case Builtin::BI__builtin_copysignf16:
2319     case Builtin::BI__builtin_copysignl:
2320     case Builtin::BI__builtin_copysignf128:
2321       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
2322 
2323     case Builtin::BIcos:
2324     case Builtin::BIcosf:
2325     case Builtin::BIcosl:
2326     case Builtin::BI__builtin_cos:
2327     case Builtin::BI__builtin_cosf:
2328     case Builtin::BI__builtin_cosf16:
2329     case Builtin::BI__builtin_cosl:
2330     case Builtin::BI__builtin_cosf128:
2331       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2332                                    Intrinsic::cos,
2333                                    Intrinsic::experimental_constrained_cos));
2334 
2335     case Builtin::BIexp:
2336     case Builtin::BIexpf:
2337     case Builtin::BIexpl:
2338     case Builtin::BI__builtin_exp:
2339     case Builtin::BI__builtin_expf:
2340     case Builtin::BI__builtin_expf16:
2341     case Builtin::BI__builtin_expl:
2342     case Builtin::BI__builtin_expf128:
2343       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2344                                    Intrinsic::exp,
2345                                    Intrinsic::experimental_constrained_exp));
2346 
2347     case Builtin::BIexp2:
2348     case Builtin::BIexp2f:
2349     case Builtin::BIexp2l:
2350     case Builtin::BI__builtin_exp2:
2351     case Builtin::BI__builtin_exp2f:
2352     case Builtin::BI__builtin_exp2f16:
2353     case Builtin::BI__builtin_exp2l:
2354     case Builtin::BI__builtin_exp2f128:
2355       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2356                                    Intrinsic::exp2,
2357                                    Intrinsic::experimental_constrained_exp2));
2358 
2359     case Builtin::BIfabs:
2360     case Builtin::BIfabsf:
2361     case Builtin::BIfabsl:
2362     case Builtin::BI__builtin_fabs:
2363     case Builtin::BI__builtin_fabsf:
2364     case Builtin::BI__builtin_fabsf16:
2365     case Builtin::BI__builtin_fabsl:
2366     case Builtin::BI__builtin_fabsf128:
2367       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
2368 
2369     case Builtin::BIfloor:
2370     case Builtin::BIfloorf:
2371     case Builtin::BIfloorl:
2372     case Builtin::BI__builtin_floor:
2373     case Builtin::BI__builtin_floorf:
2374     case Builtin::BI__builtin_floorf16:
2375     case Builtin::BI__builtin_floorl:
2376     case Builtin::BI__builtin_floorf128:
2377       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2378                                    Intrinsic::floor,
2379                                    Intrinsic::experimental_constrained_floor));
2380 
2381     case Builtin::BIfma:
2382     case Builtin::BIfmaf:
2383     case Builtin::BIfmal:
2384     case Builtin::BI__builtin_fma:
2385     case Builtin::BI__builtin_fmaf:
2386     case Builtin::BI__builtin_fmaf16:
2387     case Builtin::BI__builtin_fmal:
2388     case Builtin::BI__builtin_fmaf128:
2389       return RValue::get(emitTernaryMaybeConstrainedFPBuiltin(*this, E,
2390                                    Intrinsic::fma,
2391                                    Intrinsic::experimental_constrained_fma));
2392 
2393     case Builtin::BIfmax:
2394     case Builtin::BIfmaxf:
2395     case Builtin::BIfmaxl:
2396     case Builtin::BI__builtin_fmax:
2397     case Builtin::BI__builtin_fmaxf:
2398     case Builtin::BI__builtin_fmaxf16:
2399     case Builtin::BI__builtin_fmaxl:
2400     case Builtin::BI__builtin_fmaxf128:
2401       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2402                                    Intrinsic::maxnum,
2403                                    Intrinsic::experimental_constrained_maxnum));
2404 
2405     case Builtin::BIfmin:
2406     case Builtin::BIfminf:
2407     case Builtin::BIfminl:
2408     case Builtin::BI__builtin_fmin:
2409     case Builtin::BI__builtin_fminf:
2410     case Builtin::BI__builtin_fminf16:
2411     case Builtin::BI__builtin_fminl:
2412     case Builtin::BI__builtin_fminf128:
2413       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2414                                    Intrinsic::minnum,
2415                                    Intrinsic::experimental_constrained_minnum));
2416 
2417     // fmod() is a special case. It maps to the frem instruction rather than an
2418     // LLVM intrinsic.
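    // For example (illustrative), fmod(x, y) on doubles becomes
    // 'frem double %x, %y'.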
2419     case Builtin::BIfmod:
2420     case Builtin::BIfmodf:
2421     case Builtin::BIfmodl:
2422     case Builtin::BI__builtin_fmod:
2423     case Builtin::BI__builtin_fmodf:
2424     case Builtin::BI__builtin_fmodf16:
2425     case Builtin::BI__builtin_fmodl:
2426     case Builtin::BI__builtin_fmodf128: {
2427       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2428       Value *Arg1 = EmitScalarExpr(E->getArg(0));
2429       Value *Arg2 = EmitScalarExpr(E->getArg(1));
2430       return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
2431     }
2432 
2433     case Builtin::BIlog:
2434     case Builtin::BIlogf:
2435     case Builtin::BIlogl:
2436     case Builtin::BI__builtin_log:
2437     case Builtin::BI__builtin_logf:
2438     case Builtin::BI__builtin_logf16:
2439     case Builtin::BI__builtin_logl:
2440     case Builtin::BI__builtin_logf128:
2441       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2442                                    Intrinsic::log,
2443                                    Intrinsic::experimental_constrained_log));
2444 
2445     case Builtin::BIlog10:
2446     case Builtin::BIlog10f:
2447     case Builtin::BIlog10l:
2448     case Builtin::BI__builtin_log10:
2449     case Builtin::BI__builtin_log10f:
2450     case Builtin::BI__builtin_log10f16:
2451     case Builtin::BI__builtin_log10l:
2452     case Builtin::BI__builtin_log10f128:
2453       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2454                                    Intrinsic::log10,
2455                                    Intrinsic::experimental_constrained_log10));
2456 
2457     case Builtin::BIlog2:
2458     case Builtin::BIlog2f:
2459     case Builtin::BIlog2l:
2460     case Builtin::BI__builtin_log2:
2461     case Builtin::BI__builtin_log2f:
2462     case Builtin::BI__builtin_log2f16:
2463     case Builtin::BI__builtin_log2l:
2464     case Builtin::BI__builtin_log2f128:
2465       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2466                                    Intrinsic::log2,
2467                                    Intrinsic::experimental_constrained_log2));
2468 
2469     case Builtin::BInearbyint:
2470     case Builtin::BInearbyintf:
2471     case Builtin::BInearbyintl:
2472     case Builtin::BI__builtin_nearbyint:
2473     case Builtin::BI__builtin_nearbyintf:
2474     case Builtin::BI__builtin_nearbyintl:
2475     case Builtin::BI__builtin_nearbyintf128:
2476       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2477                                 Intrinsic::nearbyint,
2478                                 Intrinsic::experimental_constrained_nearbyint));
2479 
2480     case Builtin::BIpow:
2481     case Builtin::BIpowf:
2482     case Builtin::BIpowl:
2483     case Builtin::BI__builtin_pow:
2484     case Builtin::BI__builtin_powf:
2485     case Builtin::BI__builtin_powf16:
2486     case Builtin::BI__builtin_powl:
2487     case Builtin::BI__builtin_powf128:
2488       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2489                                    Intrinsic::pow,
2490                                    Intrinsic::experimental_constrained_pow));
2491 
2492     case Builtin::BIrint:
2493     case Builtin::BIrintf:
2494     case Builtin::BIrintl:
2495     case Builtin::BI__builtin_rint:
2496     case Builtin::BI__builtin_rintf:
2497     case Builtin::BI__builtin_rintf16:
2498     case Builtin::BI__builtin_rintl:
2499     case Builtin::BI__builtin_rintf128:
2500       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2501                                    Intrinsic::rint,
2502                                    Intrinsic::experimental_constrained_rint));
2503 
2504     case Builtin::BIround:
2505     case Builtin::BIroundf:
2506     case Builtin::BIroundl:
2507     case Builtin::BI__builtin_round:
2508     case Builtin::BI__builtin_roundf:
2509     case Builtin::BI__builtin_roundf16:
2510     case Builtin::BI__builtin_roundl:
2511     case Builtin::BI__builtin_roundf128:
2512       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2513                                    Intrinsic::round,
2514                                    Intrinsic::experimental_constrained_round));
2515 
2516     case Builtin::BIroundeven:
2517     case Builtin::BIroundevenf:
2518     case Builtin::BIroundevenl:
2519     case Builtin::BI__builtin_roundeven:
2520     case Builtin::BI__builtin_roundevenf:
2521     case Builtin::BI__builtin_roundevenf16:
2522     case Builtin::BI__builtin_roundevenl:
2523     case Builtin::BI__builtin_roundevenf128:
2524       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2525                                    Intrinsic::roundeven,
2526                                    Intrinsic::experimental_constrained_roundeven));
2527 
2528     case Builtin::BIsin:
2529     case Builtin::BIsinf:
2530     case Builtin::BIsinl:
2531     case Builtin::BI__builtin_sin:
2532     case Builtin::BI__builtin_sinf:
2533     case Builtin::BI__builtin_sinf16:
2534     case Builtin::BI__builtin_sinl:
2535     case Builtin::BI__builtin_sinf128:
2536       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2537                                    Intrinsic::sin,
2538                                    Intrinsic::experimental_constrained_sin));
2539 
2540     case Builtin::BIsqrt:
2541     case Builtin::BIsqrtf:
2542     case Builtin::BIsqrtl:
2543     case Builtin::BI__builtin_sqrt:
2544     case Builtin::BI__builtin_sqrtf:
2545     case Builtin::BI__builtin_sqrtf16:
2546     case Builtin::BI__builtin_sqrtl:
2547     case Builtin::BI__builtin_sqrtf128: {
2548       llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
2549           *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
2550       SetSqrtFPAccuracy(Call);
2551       return RValue::get(Call);
2552     }
2553     case Builtin::BItrunc:
2554     case Builtin::BItruncf:
2555     case Builtin::BItruncl:
2556     case Builtin::BI__builtin_trunc:
2557     case Builtin::BI__builtin_truncf:
2558     case Builtin::BI__builtin_truncf16:
2559     case Builtin::BI__builtin_truncl:
2560     case Builtin::BI__builtin_truncf128:
2561       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2562                                    Intrinsic::trunc,
2563                                    Intrinsic::experimental_constrained_trunc));
2564 
2565     case Builtin::BIlround:
2566     case Builtin::BIlroundf:
2567     case Builtin::BIlroundl:
2568     case Builtin::BI__builtin_lround:
2569     case Builtin::BI__builtin_lroundf:
2570     case Builtin::BI__builtin_lroundl:
2571     case Builtin::BI__builtin_lroundf128:
2572       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2573           *this, E, Intrinsic::lround,
2574           Intrinsic::experimental_constrained_lround));
2575 
2576     case Builtin::BIllround:
2577     case Builtin::BIllroundf:
2578     case Builtin::BIllroundl:
2579     case Builtin::BI__builtin_llround:
2580     case Builtin::BI__builtin_llroundf:
2581     case Builtin::BI__builtin_llroundl:
2582     case Builtin::BI__builtin_llroundf128:
2583       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2584           *this, E, Intrinsic::llround,
2585           Intrinsic::experimental_constrained_llround));
2586 
2587     case Builtin::BIlrint:
2588     case Builtin::BIlrintf:
2589     case Builtin::BIlrintl:
2590     case Builtin::BI__builtin_lrint:
2591     case Builtin::BI__builtin_lrintf:
2592     case Builtin::BI__builtin_lrintl:
2593     case Builtin::BI__builtin_lrintf128:
2594       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2595           *this, E, Intrinsic::lrint,
2596           Intrinsic::experimental_constrained_lrint));
2597 
2598     case Builtin::BIllrint:
2599     case Builtin::BIllrintf:
2600     case Builtin::BIllrintl:
2601     case Builtin::BI__builtin_llrint:
2602     case Builtin::BI__builtin_llrintf:
2603     case Builtin::BI__builtin_llrintl:
2604     case Builtin::BI__builtin_llrintf128:
2605       return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2606           *this, E, Intrinsic::llrint,
2607           Intrinsic::experimental_constrained_llrint));
2608     case Builtin::BI__builtin_ldexp:
2609     case Builtin::BI__builtin_ldexpf:
2610     case Builtin::BI__builtin_ldexpl:
2611     case Builtin::BI__builtin_ldexpf16:
2612     case Builtin::BI__builtin_ldexpf128: {
2613       return RValue::get(emitBinaryExpMaybeConstrainedFPBuiltin(
2614           *this, E, Intrinsic::ldexp,
2615           Intrinsic::experimental_constrained_ldexp));
2616     }
2617     default:
2618       break;
2619     }
2620   }
2621 
2622   switch (BuiltinIDIfNoAsmLabel) {
2623   default: break;
2624   case Builtin::BI__builtin___CFStringMakeConstantString:
2625   case Builtin::BI__builtin___NSStringMakeConstantString:
2626     return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
2627   case Builtin::BI__builtin_stdarg_start:
2628   case Builtin::BI__builtin_va_start:
2629   case Builtin::BI__va_start:
2630   case Builtin::BI__builtin_va_end:
2631     EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
2632                        ? EmitScalarExpr(E->getArg(0))
2633                        : EmitVAListRef(E->getArg(0)).getPointer(),
2634                    BuiltinID != Builtin::BI__builtin_va_end);
2635     return RValue::get(nullptr);
2636   case Builtin::BI__builtin_va_copy: {
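    // Lower va_copy to the llvm.va_copy intrinsic on the two va_list
    // pointers.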
2637     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
2638     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
2639 
2640     llvm::Type *Type = Int8PtrTy;
2641 
2642     DstPtr = Builder.CreateBitCast(DstPtr, Type);
2643     SrcPtr = Builder.CreateBitCast(SrcPtr, Type);
2644     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy), {DstPtr, SrcPtr});
2645     return RValue::get(nullptr);
2646   }
2647   case Builtin::BI__builtin_abs:
2648   case Builtin::BI__builtin_labs:
2649   case Builtin::BI__builtin_llabs: {
2650     // X < 0 ? -X : X
2651     // The negation has 'nsw' because abs of INT_MIN is undefined.
2652     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2653     Value *NegOp = Builder.CreateNSWNeg(ArgValue, "neg");
2654     Constant *Zero = llvm::Constant::getNullValue(ArgValue->getType());
2655     Value *CmpResult = Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
2656     Value *Result = Builder.CreateSelect(CmpResult, NegOp, ArgValue, "abs");
2657     return RValue::get(Result);
2658   }
2659   case Builtin::BI__builtin_complex: {
2660     Value *Real = EmitScalarExpr(E->getArg(0));
2661     Value *Imag = EmitScalarExpr(E->getArg(1));
2662     return RValue::getComplex({Real, Imag});
2663   }
2664   case Builtin::BI__builtin_conj:
2665   case Builtin::BI__builtin_conjf:
2666   case Builtin::BI__builtin_conjl:
2667   case Builtin::BIconj:
2668   case Builtin::BIconjf:
2669   case Builtin::BIconjl: {
2670     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2671     Value *Real = ComplexVal.first;
2672     Value *Imag = ComplexVal.second;
2673     Imag = Builder.CreateFNeg(Imag, "neg");
2674     return RValue::getComplex(std::make_pair(Real, Imag));
2675   }
2676   case Builtin::BI__builtin_creal:
2677   case Builtin::BI__builtin_crealf:
2678   case Builtin::BI__builtin_creall:
2679   case Builtin::BIcreal:
2680   case Builtin::BIcrealf:
2681   case Builtin::BIcreall: {
2682     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2683     return RValue::get(ComplexVal.first);
2684   }
2685 
2686   case Builtin::BI__builtin_preserve_access_index: {
2687     // Only enable the preserved access index region when debug info
2688     // is available, as debug info is needed to preserve user-level
2689     // access patterns.
2690     if (!getDebugInfo()) {
2691       CGM.Error(E->getExprLoc(), "using builtin_preserve_access_index() without -g");
2692       return RValue::get(EmitScalarExpr(E->getArg(0)));
2693     }
2694 
2695     // Nested builtin_preserve_access_index() not supported
2696     if (IsInPreservedAIRegion) {
2697       CGM.Error(E->getExprLoc(), "nested builtin_preserve_access_index() not supported");
2698       return RValue::get(EmitScalarExpr(E->getArg(0)));
2699     }
2700 
2701     IsInPreservedAIRegion = true;
2702     Value *Res = EmitScalarExpr(E->getArg(0));
2703     IsInPreservedAIRegion = false;
2704     return RValue::get(Res);
2705   }
2706 
2707   case Builtin::BI__builtin_cimag:
2708   case Builtin::BI__builtin_cimagf:
2709   case Builtin::BI__builtin_cimagl:
2710   case Builtin::BIcimag:
2711   case Builtin::BIcimagf:
2712   case Builtin::BIcimagl: {
2713     ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2714     return RValue::get(ComplexVal.second);
2715   }
2716 
2717   case Builtin::BI__builtin_clrsb:
2718   case Builtin::BI__builtin_clrsbl:
2719   case Builtin::BI__builtin_clrsbll: {
2720     // clrsb(x) -> clz(x < 0 ? ~x : x) - 1
2721     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2722 
2723     llvm::Type *ArgType = ArgValue->getType();
2724     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2725 
2726     llvm::Type *ResultType = ConvertType(E->getType());
2727     Value *Zero = llvm::Constant::getNullValue(ArgType);
2728     Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg");
2729     Value *Inverse = Builder.CreateNot(ArgValue, "not");
2730     Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue);
2731     Value *Ctlz = Builder.CreateCall(F, {Tmp, Builder.getFalse()});
2732     Value *Result = Builder.CreateSub(Ctlz, llvm::ConstantInt::get(ArgType, 1));
2733     Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2734                                    "cast");
2735     return RValue::get(Result);
2736   }
2737   case Builtin::BI__builtin_ctzs:
2738   case Builtin::BI__builtin_ctz:
2739   case Builtin::BI__builtin_ctzl:
2740   case Builtin::BI__builtin_ctzll: {
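    // ctz(x) -> cttz(x); the behavior for a zero input follows the target's
    // isCLZForZeroUndef() setting.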
2741     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
2742 
2743     llvm::Type *ArgType = ArgValue->getType();
2744     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
2745 
2746     llvm::Type *ResultType = ConvertType(E->getType());
2747     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
2748     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
2749     if (Result->getType() != ResultType)
2750       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2751                                      "cast");
2752     return RValue::get(Result);
2753   }
2754   case Builtin::BI__builtin_clzs:
2755   case Builtin::BI__builtin_clz:
2756   case Builtin::BI__builtin_clzl:
2757   case Builtin::BI__builtin_clzll: {
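    // clz(x) -> ctlz(x); the behavior for a zero input follows the target's
    // isCLZForZeroUndef() setting.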
2758     Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
2759 
2760     llvm::Type *ArgType = ArgValue->getType();
2761     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2762 
2763     llvm::Type *ResultType = ConvertType(E->getType());
2764     Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
2765     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
2766     if (Result->getType() != ResultType)
2767       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2768                                      "cast");
2769     return RValue::get(Result);
2770   }
2771   case Builtin::BI__builtin_ffs:
2772   case Builtin::BI__builtin_ffsl:
2773   case Builtin::BI__builtin_ffsll: {
2774     // ffs(x) -> x ? cttz(x) + 1 : 0
2775     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2776 
2777     llvm::Type *ArgType = ArgValue->getType();
2778     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
2779 
2780     llvm::Type *ResultType = ConvertType(E->getType());
2781     Value *Tmp =
2782         Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
2783                           llvm::ConstantInt::get(ArgType, 1));
2784     Value *Zero = llvm::Constant::getNullValue(ArgType);
2785     Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
2786     Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
2787     if (Result->getType() != ResultType)
2788       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2789                                      "cast");
2790     return RValue::get(Result);
2791   }
2792   case Builtin::BI__builtin_parity:
2793   case Builtin::BI__builtin_parityl:
2794   case Builtin::BI__builtin_parityll: {
2795     // parity(x) -> ctpop(x) & 1
2796     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2797 
2798     llvm::Type *ArgType = ArgValue->getType();
2799     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
2800 
2801     llvm::Type *ResultType = ConvertType(E->getType());
2802     Value *Tmp = Builder.CreateCall(F, ArgValue);
2803     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
2804     if (Result->getType() != ResultType)
2805       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2806                                      "cast");
2807     return RValue::get(Result);
2808   }
2809   case Builtin::BI__lzcnt16:
2810   case Builtin::BI__lzcnt:
2811   case Builtin::BI__lzcnt64: {
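    // Lower the MSVC __lzcnt* builtins to ctlz with a defined result for a
    // zero input.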
2812     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2813 
2814     llvm::Type *ArgType = ArgValue->getType();
2815     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
2816 
2817     llvm::Type *ResultType = ConvertType(E->getType());
2818     Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getFalse()});
2819     if (Result->getType() != ResultType)
2820       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2821                                      "cast");
2822     return RValue::get(Result);
2823   }
2824   case Builtin::BI__popcnt16:
2825   case Builtin::BI__popcnt:
2826   case Builtin::BI__popcnt64:
2827   case Builtin::BI__builtin_popcount:
2828   case Builtin::BI__builtin_popcountl:
2829   case Builtin::BI__builtin_popcountll: {
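    // popcount(x) -> ctpop(x)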
2830     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2831 
2832     llvm::Type *ArgType = ArgValue->getType();
2833     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
2834 
2835     llvm::Type *ResultType = ConvertType(E->getType());
2836     Value *Result = Builder.CreateCall(F, ArgValue);
2837     if (Result->getType() != ResultType)
2838       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
2839                                      "cast");
2840     return RValue::get(Result);
2841   }
2842   case Builtin::BI__builtin_unpredictable: {
2843     // Always return the argument of __builtin_unpredictable. LLVM does not
2844     // handle this builtin. Metadata for this builtin should be added directly
2845     // to instructions such as branches or switches that use it.
2846     return RValue::get(EmitScalarExpr(E->getArg(0)));
2847   }
2848   case Builtin::BI__builtin_expect: {
2849     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2850     llvm::Type *ArgType = ArgValue->getType();
2851 
2852     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
2853     // Don't generate llvm.expect on -O0 as the backend won't use it for
2854     // anything.
2855     // Note, we still IRGen ExpectedValue because it could have side-effects.
2856     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
2857       return RValue::get(ArgValue);
2858 
2859     Function *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
2860     Value *Result =
2861         Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
2862     return RValue::get(Result);
2863   }
2864   case Builtin::BI__builtin_expect_with_probability: {
2865     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2866     llvm::Type *ArgType = ArgValue->getType();
2867 
2868     Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
2869     llvm::APFloat Probability(0.0);
2870     const Expr *ProbArg = E->getArg(2);
2871     bool EvalSucceed = ProbArg->EvaluateAsFloat(Probability, CGM.getContext());
2872     assert(EvalSucceed && "probability should be able to evaluate as float");
2873     (void)EvalSucceed;
2874     bool LoseInfo = false;
2875     Probability.convert(llvm::APFloat::IEEEdouble(),
2876                         llvm::RoundingMode::Dynamic, &LoseInfo);
2877     llvm::Type *Ty = ConvertType(ProbArg->getType());
2878     Constant *Confidence = ConstantFP::get(Ty, Probability);
2879     // Don't generate llvm.expect.with.probability on -O0 as the backend
2880     // won't use it for anything.
2881     // Note, we still IRGen ExpectedValue because it could have side-effects.
2882     if (CGM.getCodeGenOpts().OptimizationLevel == 0)
2883       return RValue::get(ArgValue);
2884 
2885     Function *FnExpect =
2886         CGM.getIntrinsic(Intrinsic::expect_with_probability, ArgType);
2887     Value *Result = Builder.CreateCall(
2888         FnExpect, {ArgValue, ExpectedValue, Confidence}, "expval");
2889     return RValue::get(Result);
2890   }
2891   case Builtin::BI__builtin_assume_aligned: {
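    // Clamp the requested alignment to the maximum LLVM supports, then emit
    // an alignment assumption on the pointer value.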
2892     const Expr *Ptr = E->getArg(0);
2893     Value *PtrValue = EmitScalarExpr(Ptr);
2894     Value *OffsetValue =
2895       (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
2896 
2897     Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
2898     ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
2899     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
2900       AlignmentCI = ConstantInt::get(AlignmentCI->getType(),
2901                                      llvm::Value::MaximumAlignment);
2902 
2903     emitAlignmentAssumption(PtrValue, Ptr,
2904                             /*The expr loc is sufficient.*/ SourceLocation(),
2905                             AlignmentCI, OffsetValue);
2906     return RValue::get(PtrValue);
2907   }
2908   case Builtin::BI__assume:
2909   case Builtin::BI__builtin_assume: {
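    // If the argument has side effects, don't emit it; the assumption is
    // simply dropped.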
2910     if (E->getArg(0)->HasSideEffects(getContext()))
2911       return RValue::get(nullptr);
2912 
2913     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2914     Function *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
2915     Builder.CreateCall(FnAssume, ArgValue);
2916     return RValue::get(nullptr);
2917   }
2918   case Builtin::BI__builtin_assume_separate_storage: {
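    // Emit llvm.assume with a "separate_storage" operand bundle holding the
    // two pointer values.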
2919     const Expr *Arg0 = E->getArg(0);
2920     const Expr *Arg1 = E->getArg(1);
2921 
2922     Value *Value0 = EmitScalarExpr(Arg0);
2923     Value *Value1 = EmitScalarExpr(Arg1);
2924 
2925     Value *Values[] = {Value0, Value1};
2926     OperandBundleDefT<Value *> OBD("separate_storage", Values);
2927     Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD});
2928     return RValue::get(nullptr);
2929   }
2930   case Builtin::BI__arithmetic_fence: {
2931     // Emit the fence only if fast-math reassociation is allowed and the
2932     // target supports __arithmetic_fence; otherwise just return the argument.
2933     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2934     llvm::FastMathFlags FMF = Builder.getFastMathFlags();
2935     bool isArithmeticFenceEnabled =
2936         FMF.allowReassoc() &&
2937         getContext().getTargetInfo().checkArithmeticFenceSupported();
2938     QualType ArgType = E->getArg(0)->getType();
2939     if (ArgType->isComplexType()) {
2940       if (isArithmeticFenceEnabled) {
2941         QualType ElementType = ArgType->castAs<ComplexType>()->getElementType();
2942         ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2943         Value *Real = Builder.CreateArithmeticFence(ComplexVal.first,
2944                                                     ConvertType(ElementType));
2945         Value *Imag = Builder.CreateArithmeticFence(ComplexVal.second,
2946                                                     ConvertType(ElementType));
2947         return RValue::getComplex(std::make_pair(Real, Imag));
2948       }
2949       ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
2950       Value *Real = ComplexVal.first;
2951       Value *Imag = ComplexVal.second;
2952       return RValue::getComplex(std::make_pair(Real, Imag));
2953     }
2954     Value *ArgValue = EmitScalarExpr(E->getArg(0));
2955     if (isArithmeticFenceEnabled)
2956       return RValue::get(
2957           Builder.CreateArithmeticFence(ArgValue, ConvertType(ArgType)));
2958     return RValue::get(ArgValue);
2959   }
2960   case Builtin::BI__builtin_bswap16:
2961   case Builtin::BI__builtin_bswap32:
2962   case Builtin::BI__builtin_bswap64:
2963   case Builtin::BI_byteswap_ushort:
2964   case Builtin::BI_byteswap_ulong:
2965   case Builtin::BI_byteswap_uint64: {
2966     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
2967   }
2968   case Builtin::BI__builtin_bitreverse8:
2969   case Builtin::BI__builtin_bitreverse16:
2970   case Builtin::BI__builtin_bitreverse32:
2971   case Builtin::BI__builtin_bitreverse64: {
2972     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
2973   }
2974   case Builtin::BI__builtin_rotateleft8:
2975   case Builtin::BI__builtin_rotateleft16:
2976   case Builtin::BI__builtin_rotateleft32:
2977   case Builtin::BI__builtin_rotateleft64:
2978   case Builtin::BI_rotl8: // Microsoft variants of rotate left
2979   case Builtin::BI_rotl16:
2980   case Builtin::BI_rotl:
2981   case Builtin::BI_lrotl:
2982   case Builtin::BI_rotl64:
2983     return emitRotate(E, false);
2984 
2985   case Builtin::BI__builtin_rotateright8:
2986   case Builtin::BI__builtin_rotateright16:
2987   case Builtin::BI__builtin_rotateright32:
2988   case Builtin::BI__builtin_rotateright64:
2989   case Builtin::BI_rotr8: // Microsoft variants of rotate right
2990   case Builtin::BI_rotr16:
2991   case Builtin::BI_rotr:
2992   case Builtin::BI_lrotr:
2993   case Builtin::BI_rotr64:
2994     return emitRotate(E, true);
2995 
2996   case Builtin::BI__builtin_constant_p: {
2997     llvm::Type *ResultType = ConvertType(E->getType());
2998 
2999     const Expr *Arg = E->getArg(0);
3000     QualType ArgType = Arg->getType();
3001     // FIXME: The allowance for Obj-C pointers and block pointers is historical
3002     // and likely a mistake.
3003     if (!ArgType->isIntegralOrEnumerationType() && !ArgType->isFloatingType() &&
3004         !ArgType->isObjCObjectPointerType() && !ArgType->isBlockPointerType())
3005       // Per the GCC documentation, only numeric constants are recognized after
3006       // inlining.
3007       return RValue::get(ConstantInt::get(ResultType, 0));
3008 
3009     if (Arg->HasSideEffects(getContext()))
3010       // The argument is unevaluated, so be conservative if it might have
3011       // side-effects.
3012       return RValue::get(ConstantInt::get(ResultType, 0));
3013 
3014     Value *ArgValue = EmitScalarExpr(Arg);
3015     if (ArgType->isObjCObjectPointerType()) {
3016       // Convert Objective-C objects to id because we cannot distinguish between
3017       // LLVM types for Obj-C classes as they are opaque.
3018       ArgType = CGM.getContext().getObjCIdType();
3019       ArgValue = Builder.CreateBitCast(ArgValue, ConvertType(ArgType));
3020     }
3021     Function *F =
3022         CGM.getIntrinsic(Intrinsic::is_constant, ConvertType(ArgType));
3023     Value *Result = Builder.CreateCall(F, ArgValue);
3024     if (Result->getType() != ResultType)
3025       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/false);
3026     return RValue::get(Result);
3027   }
3028   case Builtin::BI__builtin_dynamic_object_size:
3029   case Builtin::BI__builtin_object_size: {
3030     unsigned Type =
3031         E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
3032     auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
3033 
3034     // We pass this builtin onto the optimizer so that it can figure out the
3035     // object size in more complex cases.
3036     bool IsDynamic = BuiltinID == Builtin::BI__builtin_dynamic_object_size;
3037     return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
3038                                              /*EmittedE=*/nullptr, IsDynamic));
3039   }
3040   case Builtin::BI__builtin_prefetch: {
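    // The rw and locality arguments are optional; they default to 0 (read)
    // and 3 (high locality) respectively.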
3041     Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
3042     // FIXME: Technically these constants should be of type 'int', yes?
3043     RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
3044       llvm::ConstantInt::get(Int32Ty, 0);
3045     Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
3046       llvm::ConstantInt::get(Int32Ty, 3);
3047     Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
3048     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
3049     Builder.CreateCall(F, {Address, RW, Locality, Data});
3050     return RValue::get(nullptr);
3051   }
3052   case Builtin::BI__builtin_readcyclecounter: {
3053     Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
3054     return RValue::get(Builder.CreateCall(F));
3055   }
3056   case Builtin::BI__builtin___clear_cache: {
3057     Value *Begin = EmitScalarExpr(E->getArg(0));
3058     Value *End = EmitScalarExpr(E->getArg(1));
3059     Function *F = CGM.getIntrinsic(Intrinsic::clear_cache);
3060     return RValue::get(Builder.CreateCall(F, {Begin, End}));
3061   }
3062   case Builtin::BI__builtin_trap:
3063     EmitTrapCall(Intrinsic::trap);
3064     return RValue::get(nullptr);
3065   case Builtin::BI__debugbreak:
3066     EmitTrapCall(Intrinsic::debugtrap);
3067     return RValue::get(nullptr);
3068   case Builtin::BI__builtin_unreachable: {
3069     EmitUnreachable(E->getExprLoc());
3070 
3071     // We do need to preserve an insertion point.
3072     EmitBlock(createBasicBlock("unreachable.cont"));
3073 
3074     return RValue::get(nullptr);
3075   }
3076 
3077   case Builtin::BI__builtin_powi:
3078   case Builtin::BI__builtin_powif:
3079   case Builtin::BI__builtin_powil: {
3080     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
3081     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
3082 
3083     if (Builder.getIsFPConstrained()) {
3084       // FIXME: llvm.powi has two overloaded types, while
3085       // llvm.experimental.constrained.powi has only one.
3086       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3087       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_powi,
3088                                      Src0->getType());
3089       return RValue::get(Builder.CreateConstrainedFPCall(F, { Src0, Src1 }));
3090     }
3091 
3092     Function *F = CGM.getIntrinsic(Intrinsic::powi,
3093                                    { Src0->getType(), Src1->getType() });
3094     return RValue::get(Builder.CreateCall(F, { Src0, Src1 }));
3095   }
3096   case Builtin::BI__builtin_frexp:
3097   case Builtin::BI__builtin_frexpf:
3098   case Builtin::BI__builtin_frexpl:
3099   case Builtin::BI__builtin_frexpf128:
3100   case Builtin::BI__builtin_frexpf16:
3101     return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp));
3102   case Builtin::BI__builtin_isgreater:
3103   case Builtin::BI__builtin_isgreaterequal:
3104   case Builtin::BI__builtin_isless:
3105   case Builtin::BI__builtin_islessequal:
3106   case Builtin::BI__builtin_islessgreater:
3107   case Builtin::BI__builtin_isunordered: {
3108     // Ordered comparisons: we know the arguments to these are matching scalar
3109     // floating point values.
3110     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3111     Value *LHS = EmitScalarExpr(E->getArg(0));
3112     Value *RHS = EmitScalarExpr(E->getArg(1));
3113 
3114     switch (BuiltinID) {
3115     default: llvm_unreachable("Unknown ordered comparison");
3116     case Builtin::BI__builtin_isgreater:
3117       LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
3118       break;
3119     case Builtin::BI__builtin_isgreaterequal:
3120       LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
3121       break;
3122     case Builtin::BI__builtin_isless:
3123       LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
3124       break;
3125     case Builtin::BI__builtin_islessequal:
3126       LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
3127       break;
3128     case Builtin::BI__builtin_islessgreater:
3129       LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
3130       break;
3131     case Builtin::BI__builtin_isunordered:
3132       LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
3133       break;
3134     }
3135     // ZExt bool to int type.
3136     return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
3137   }
3138 
3139   case Builtin::BI__builtin_isnan: {
3140     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3141     Value *V = EmitScalarExpr(E->getArg(0));
3142     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3143       return RValue::get(Result);
3144     return RValue::get(
3145         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNan),
3146                            ConvertType(E->getType())));
3147   }
3148 
3149   case Builtin::BI__builtin_isinf: {
3150     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3151     Value *V = EmitScalarExpr(E->getArg(0));
3152     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3153       return RValue::get(Result);
3154     return RValue::get(
3155         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcInf),
3156                            ConvertType(E->getType())));
3157   }
3158 
3159   case Builtin::BIfinite:
3160   case Builtin::BI__finite:
3161   case Builtin::BIfinitef:
3162   case Builtin::BI__finitef:
3163   case Builtin::BIfinitel:
3164   case Builtin::BI__finitel:
3165   case Builtin::BI__builtin_isfinite: {
3166     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3167     Value *V = EmitScalarExpr(E->getArg(0));
3168     if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3169       return RValue::get(Result);
3170     return RValue::get(
3171         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcFinite),
3172                            ConvertType(E->getType())));
3173   }
3174 
3175   case Builtin::BI__builtin_isnormal: {
3176     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3177     Value *V = EmitScalarExpr(E->getArg(0));
3178     return RValue::get(
3179         Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNormal),
3180                            ConvertType(E->getType())));
3181   }
3182 
3183   case Builtin::BI__builtin_isfpclass: {
3184     Expr::EvalResult Result;
3185     if (!E->getArg(1)->EvaluateAsInt(Result, CGM.getContext()))
3186       break;
3187     uint64_t Test = Result.Val.getInt().getLimitedValue();
3188     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3189     Value *V = EmitScalarExpr(E->getArg(0));
3190     return RValue::get(Builder.CreateZExt(Builder.createIsFPClass(V, Test),
3191                                           ConvertType(E->getType())));
3192   }
3193 
3194   case Builtin::BI__builtin_nondeterministic_value: {
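    // freeze(poison) yields an arbitrary but consistent value of the given
    // type.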
3195     llvm::Type *Ty = ConvertType(E->getArg(0)->getType());
3196 
3197     Value *Result = PoisonValue::get(Ty);
3198     Result = Builder.CreateFreeze(Result);
3199 
3200     return RValue::get(Result);
3201   }
3202 
3203   case Builtin::BI__builtin_elementwise_abs: {
3204     Value *Result;
3205     QualType QT = E->getArg(0)->getType();
3206 
3207     if (auto *VecTy = QT->getAs<VectorType>())
3208       QT = VecTy->getElementType();
3209     if (QT->isIntegerType())
3210       Result = Builder.CreateBinaryIntrinsic(
3211           llvm::Intrinsic::abs, EmitScalarExpr(E->getArg(0)),
3212           Builder.getFalse(), nullptr, "elt.abs");
3213     else
3214       Result = emitUnaryBuiltin(*this, E, llvm::Intrinsic::fabs, "elt.abs");
3215 
3216     return RValue::get(Result);
3217   }
3218 
3219   case Builtin::BI__builtin_elementwise_ceil:
3220     return RValue::get(
3221         emitUnaryBuiltin(*this, E, llvm::Intrinsic::ceil, "elt.ceil"));
3222   case Builtin::BI__builtin_elementwise_exp:
3223     return RValue::get(
3224         emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp, "elt.exp"));
3225   case Builtin::BI__builtin_elementwise_exp2:
3226     return RValue::get(
3227         emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp2, "elt.exp2"));
3228   case Builtin::BI__builtin_elementwise_log:
3229     return RValue::get(
3230         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log, "elt.log"));
3231   case Builtin::BI__builtin_elementwise_log2:
3232     return RValue::get(
3233         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log2, "elt.log2"));
3234   case Builtin::BI__builtin_elementwise_log10:
3235     return RValue::get(
3236         emitUnaryBuiltin(*this, E, llvm::Intrinsic::log10, "elt.log10"));
3237   case Builtin::BI__builtin_elementwise_pow: {
3238     return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::pow));
3239   }
3240   case Builtin::BI__builtin_elementwise_cos:
3241     return RValue::get(
3242         emitUnaryBuiltin(*this, E, llvm::Intrinsic::cos, "elt.cos"));
3243   case Builtin::BI__builtin_elementwise_floor:
3244     return RValue::get(
3245         emitUnaryBuiltin(*this, E, llvm::Intrinsic::floor, "elt.floor"));
3246   case Builtin::BI__builtin_elementwise_roundeven:
3247     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::roundeven,
3248                                         "elt.roundeven"));
3249   case Builtin::BI__builtin_elementwise_round:
3250     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::round,
3251                                         "elt.round"));
3252   case Builtin::BI__builtin_elementwise_rint:
3253     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::rint,
3254                                         "elt.rint"));
3255   case Builtin::BI__builtin_elementwise_nearbyint:
3256     return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::nearbyint,
3257                                         "elt.nearbyint"));
3258   case Builtin::BI__builtin_elementwise_sin:
3259     return RValue::get(
3260         emitUnaryBuiltin(*this, E, llvm::Intrinsic::sin, "elt.sin"));
3261 
3262   case Builtin::BI__builtin_elementwise_trunc:
3263     return RValue::get(
3264         emitUnaryBuiltin(*this, E, llvm::Intrinsic::trunc, "elt.trunc"));
3265   case Builtin::BI__builtin_elementwise_canonicalize:
3266     return RValue::get(
3267         emitUnaryBuiltin(*this, E, llvm::Intrinsic::canonicalize, "elt.canonicalize"));
3268   case Builtin::BI__builtin_elementwise_copysign:
3269     return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::copysign));
3270   case Builtin::BI__builtin_elementwise_fma:
3271     return RValue::get(emitTernaryBuiltin(*this, E, llvm::Intrinsic::fma));
3272   case Builtin::BI__builtin_elementwise_add_sat:
3273   case Builtin::BI__builtin_elementwise_sub_sat: {
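    // Pick the signed or unsigned saturating intrinsic based on the (element)
    // type of the first argument.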
3274     Value *Op0 = EmitScalarExpr(E->getArg(0));
3275     Value *Op1 = EmitScalarExpr(E->getArg(1));
3276     Value *Result;
3277     assert(Op0->getType()->isIntOrIntVectorTy() && "integer type expected");
3278     QualType Ty = E->getArg(0)->getType();
3279     if (auto *VecTy = Ty->getAs<VectorType>())
3280       Ty = VecTy->getElementType();
3281     bool IsSigned = Ty->isSignedIntegerType();
3282     unsigned Opc;
3283     if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_elementwise_add_sat)
3284       Opc = IsSigned ? llvm::Intrinsic::sadd_sat : llvm::Intrinsic::uadd_sat;
3285     else
3286       Opc = IsSigned ? llvm::Intrinsic::ssub_sat : llvm::Intrinsic::usub_sat;
3287     Result = Builder.CreateBinaryIntrinsic(Opc, Op0, Op1, nullptr, "elt.sat");
3288     return RValue::get(Result);
3289   }
3290 
3291   case Builtin::BI__builtin_elementwise_max: {
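    // Integer operands map to smax/umax; floating-point operands map to
    // maxnum.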
3292     Value *Op0 = EmitScalarExpr(E->getArg(0));
3293     Value *Op1 = EmitScalarExpr(E->getArg(1));
3294     Value *Result;
3295     if (Op0->getType()->isIntOrIntVectorTy()) {
3296       QualType Ty = E->getArg(0)->getType();
3297       if (auto *VecTy = Ty->getAs<VectorType>())
3298         Ty = VecTy->getElementType();
3299       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3300                                                  ? llvm::Intrinsic::smax
3301                                                  : llvm::Intrinsic::umax,
3302                                              Op0, Op1, nullptr, "elt.max");
3303     } else
3304       Result = Builder.CreateMaxNum(Op0, Op1, "elt.max");
3305     return RValue::get(Result);
3306   }
3307   case Builtin::BI__builtin_elementwise_min: {
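    // Integer operands map to smin/umin; floating-point operands map to
    // minnum.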
3308     Value *Op0 = EmitScalarExpr(E->getArg(0));
3309     Value *Op1 = EmitScalarExpr(E->getArg(1));
3310     Value *Result;
3311     if (Op0->getType()->isIntOrIntVectorTy()) {
3312       QualType Ty = E->getArg(0)->getType();
3313       if (auto *VecTy = Ty->getAs<VectorType>())
3314         Ty = VecTy->getElementType();
3315       Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3316                                                  ? llvm::Intrinsic::smin
3317                                                  : llvm::Intrinsic::umin,
3318                                              Op0, Op1, nullptr, "elt.min");
3319     } else
3320       Result = Builder.CreateMinNum(Op0, Op1, "elt.min");
3321     return RValue::get(Result);
3322   }
3323 
3324   case Builtin::BI__builtin_reduce_max: {
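    // Select the signed, unsigned, or floating-point reduction intrinsic
    // based on the element type.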
3325     auto GetIntrinsicID = [](QualType QT) {
3326       if (auto *VecTy = QT->getAs<VectorType>())
3327         QT = VecTy->getElementType();
3328       if (QT->isSignedIntegerType())
3329         return llvm::Intrinsic::vector_reduce_smax;
3330       if (QT->isUnsignedIntegerType())
3331         return llvm::Intrinsic::vector_reduce_umax;
3332       assert(QT->isFloatingType() && "must have a float here");
3333       return llvm::Intrinsic::vector_reduce_fmax;
3334     };
3335     return RValue::get(emitUnaryBuiltin(
3336         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.max"));
3337   }
3338 
3339   case Builtin::BI__builtin_reduce_min: {
3340     auto GetIntrinsicID = [](QualType QT) {
3341       if (auto *VecTy = QT->getAs<VectorType>())
3342         QT = VecTy->getElementType();
3343       if (QT->isSignedIntegerType())
3344         return llvm::Intrinsic::vector_reduce_smin;
3345       if (QT->isUnsignedIntegerType())
3346         return llvm::Intrinsic::vector_reduce_umin;
3347       assert(QT->isFloatingType() && "must have a float here");
3348       return llvm::Intrinsic::vector_reduce_fmin;
3349     };
3350 
3351     return RValue::get(emitUnaryBuiltin(
3352         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
3353   }
3354 
3355   case Builtin::BI__builtin_reduce_add:
3356     return RValue::get(emitUnaryBuiltin(
3357         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
3358   case Builtin::BI__builtin_reduce_mul:
3359     return RValue::get(emitUnaryBuiltin(
3360         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
3361   case Builtin::BI__builtin_reduce_xor:
3362     return RValue::get(emitUnaryBuiltin(
3363         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
3364   case Builtin::BI__builtin_reduce_or:
3365     return RValue::get(emitUnaryBuiltin(
3366         *this, E, llvm::Intrinsic::vector_reduce_or, "rdx.or"));
3367   case Builtin::BI__builtin_reduce_and:
3368     return RValue::get(emitUnaryBuiltin(
3369         *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
3370 
3371   case Builtin::BI__builtin_matrix_transpose: {
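    // Emit a matrix transpose via MatrixBuilder using the static row and
    // column counts from the matrix type.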
3372     auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
3373     Value *MatValue = EmitScalarExpr(E->getArg(0));
3374     MatrixBuilder MB(Builder);
3375     Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
3376                                              MatrixTy->getNumColumns());
3377     return RValue::get(Result);
3378   }
3379 
3380   case Builtin::BI__builtin_matrix_column_major_load: {
3381     MatrixBuilder MB(Builder);
3382     // Emit everything that isn't dependent on the first parameter type
3383     Value *Stride = EmitScalarExpr(E->getArg(3));
3384     const auto *ResultTy = E->getType()->getAs<ConstantMatrixType>();
3385     auto *PtrTy = E->getArg(0)->getType()->getAs<PointerType>();
3386     assert(PtrTy && "arg0 must be of pointer type");
3387     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3388 
3389     Address Src = EmitPointerWithAlignment(E->getArg(0));
3390     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
3391                         E->getArg(0)->getExprLoc(), FD, 0);
3392     Value *Result = MB.CreateColumnMajorLoad(
3393         Src.getElementType(), Src.getPointer(),
3394         Align(Src.getAlignment().getQuantity()), Stride, IsVolatile,
3395         ResultTy->getNumRows(), ResultTy->getNumColumns(),
3396         "matrix");
3397     return RValue::get(Result);
3398   }
3399 
3400   case Builtin::BI__builtin_matrix_column_major_store: {
3401     MatrixBuilder MB(Builder);
3402     Value *Matrix = EmitScalarExpr(E->getArg(0));
3403     Address Dst = EmitPointerWithAlignment(E->getArg(1));
3404     Value *Stride = EmitScalarExpr(E->getArg(2));
3405 
3406     const auto *MatrixTy = E->getArg(0)->getType()->getAs<ConstantMatrixType>();
3407     auto *PtrTy = E->getArg(1)->getType()->getAs<PointerType>();
3408     assert(PtrTy && "arg1 must be of pointer type");
3409     bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3410 
3411     EmitNonNullArgCheck(RValue::get(Dst.getPointer()), E->getArg(1)->getType(),
3412                         E->getArg(1)->getExprLoc(), FD, 0);
3413     Value *Result = MB.CreateColumnMajorStore(
3414         Matrix, Dst.getPointer(), Align(Dst.getAlignment().getQuantity()),
3415         Stride, IsVolatile, MatrixTy->getNumRows(), MatrixTy->getNumColumns());
3416     return RValue::get(Result);
3417   }
3418 
3419   case Builtin::BI__builtin_isinf_sign: {
3420     // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
3421     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3422     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3423     Value *Arg = EmitScalarExpr(E->getArg(0));
3424     Value *AbsArg = EmitFAbs(*this, Arg);
3425     Value *IsInf = Builder.CreateFCmpOEQ(
3426         AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
3427     Value *IsNeg = EmitSignBit(*this, Arg);
3428 
3429     llvm::Type *IntTy = ConvertType(E->getType());
3430     Value *Zero = Constant::getNullValue(IntTy);
3431     Value *One = ConstantInt::get(IntTy, 1);
3432     Value *NegativeOne = ConstantInt::get(IntTy, -1);
3433     Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
3434     Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
3435     return RValue::get(Result);
3436   }
3437 
3438   case Builtin::BI__builtin_flt_rounds: {
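    // Read the current rounding mode via llvm.get_rounding.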
3439     Function *F = CGM.getIntrinsic(Intrinsic::get_rounding);
3440 
3441     llvm::Type *ResultType = ConvertType(E->getType());
3442     Value *Result = Builder.CreateCall(F);
3443     if (Result->getType() != ResultType)
3444       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3445                                      "cast");
3446     return RValue::get(Result);
3447   }
3448 
3449   case Builtin::BI__builtin_set_flt_rounds: {
3450     Function *F = CGM.getIntrinsic(Intrinsic::set_rounding);
3451 
3452     Value *V = EmitScalarExpr(E->getArg(0));
3453     Builder.CreateCall(F, V);
3454     return RValue::get(nullptr);
3455   }
3456 
3457   case Builtin::BI__builtin_fpclassify: {
3458     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3459     // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3460     Value *V = EmitScalarExpr(E->getArg(5));
3461     llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
3462 
3463     // Create Result
3464     BasicBlock *Begin = Builder.GetInsertBlock();
3465     BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
3466     Builder.SetInsertPoint(End);
3467     PHINode *Result =
3468       Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
3469                         "fpclassify_result");
3470 
3471     // if (V==0) return FP_ZERO
3472     Builder.SetInsertPoint(Begin);
3473     Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
3474                                           "iszero");
3475     Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
3476     BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
3477     Builder.CreateCondBr(IsZero, End, NotZero);
3478     Result->addIncoming(ZeroLiteral, Begin);
3479 
3480     // if (V != V) return FP_NAN
3481     Builder.SetInsertPoint(NotZero);
3482     Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
3483     Value *NanLiteral = EmitScalarExpr(E->getArg(0));
3484     BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
3485     Builder.CreateCondBr(IsNan, End, NotNan);
3486     Result->addIncoming(NanLiteral, NotZero);
3487 
3488     // if (fabs(V) == infinity) return FP_INFINITY
3489     Builder.SetInsertPoint(NotNan);
3490     Value *VAbs = EmitFAbs(*this, V);
3491     Value *IsInf =
3492       Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
3493                             "isinf");
3494     Value *InfLiteral = EmitScalarExpr(E->getArg(1));
3495     BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
3496     Builder.CreateCondBr(IsInf, End, NotInf);
3497     Result->addIncoming(InfLiteral, NotNan);
3498 
3499     // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
3500     Builder.SetInsertPoint(NotInf);
3501     APFloat Smallest = APFloat::getSmallestNormalized(
3502         getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
3503     Value *IsNormal =
3504       Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
3505                             "isnormal");
3506     Value *NormalResult =
3507       Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
3508                            EmitScalarExpr(E->getArg(3)));
3509     Builder.CreateBr(End);
3510     Result->addIncoming(NormalResult, NotInf);
3511 
3512     // return Result
3513     Builder.SetInsertPoint(End);
3514     return RValue::get(Result);
3515   }
3516 
3517   case Builtin::BIalloca:
3518   case Builtin::BI_alloca:
3519   case Builtin::BI__builtin_alloca_uninitialized:
3520   case Builtin::BI__builtin_alloca: {
3521     Value *Size = EmitScalarExpr(E->getArg(0));
3522     const TargetInfo &TI = getContext().getTargetInfo();
3523     // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
3524     const Align SuitableAlignmentInBytes =
3525         CGM.getContext()
3526             .toCharUnitsFromBits(TI.getSuitableAlign())
3527             .getAsAlign();
3528     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3529     AI->setAlignment(SuitableAlignmentInBytes);
3530     if (BuiltinID != Builtin::BI__builtin_alloca_uninitialized)
3531       initializeAlloca(*this, AI, Size, SuitableAlignmentInBytes);
3532     return RValue::get(AI);
3533   }
3534 
3535   case Builtin::BI__builtin_alloca_with_align_uninitialized:
3536   case Builtin::BI__builtin_alloca_with_align: {
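    // The second argument is a constant alignment in bits; convert it to
    // bytes for the alloca.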
3537     Value *Size = EmitScalarExpr(E->getArg(0));
3538     Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
3539     auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
3540     unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
3541     const Align AlignmentInBytes =
3542         CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getAsAlign();
3543     AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3544     AI->setAlignment(AlignmentInBytes);
3545     if (BuiltinID != Builtin::BI__builtin_alloca_with_align_uninitialized)
3546       initializeAlloca(*this, AI, Size, AlignmentInBytes);
3547     return RValue::get(AI);
3548   }
3549 
3550   case Builtin::BIbzero:
3551   case Builtin::BI__builtin_bzero: {
3552     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3553     Value *SizeVal = EmitScalarExpr(E->getArg(1));
3554     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3555                         E->getArg(0)->getExprLoc(), FD, 0);
3556     Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
3557     return RValue::get(nullptr);
3558   }
3559   case Builtin::BImemcpy:
3560   case Builtin::BI__builtin_memcpy:
3561   case Builtin::BImempcpy:
3562   case Builtin::BI__builtin_mempcpy: {
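    // memcpy returns the destination pointer; mempcpy returns a pointer one
    // past the last byte written.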
3563     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3564     Address Src = EmitPointerWithAlignment(E->getArg(1));
3565     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3566     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3567                         E->getArg(0)->getExprLoc(), FD, 0);
3568     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3569                         E->getArg(1)->getExprLoc(), FD, 1);
3570     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
3571     if (BuiltinID == Builtin::BImempcpy ||
3572         BuiltinID == Builtin::BI__builtin_mempcpy)
3573       return RValue::get(Builder.CreateInBoundsGEP(Dest.getElementType(),
3574                                                    Dest.getPointer(), SizeVal));
3575     else
3576       return RValue::get(Dest.getPointer());
3577   }
3578 
3579   case Builtin::BI__builtin_memcpy_inline: {
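    // The size must be a compile-time constant; llvm.memcpy.inline is
    // expanded inline rather than as a library call.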
3580     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3581     Address Src = EmitPointerWithAlignment(E->getArg(1));
3582     uint64_t Size =
3583         E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
3584     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3585                         E->getArg(0)->getExprLoc(), FD, 0);
3586     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3587                         E->getArg(1)->getExprLoc(), FD, 1);
3588     Builder.CreateMemCpyInline(Dest, Src, Size);
3589     return RValue::get(nullptr);
3590   }
3591 
3592   case Builtin::BI__builtin_char_memchr:
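    // __builtin_char_memchr is handled exactly like __builtin_memchr.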
3593     BuiltinID = Builtin::BI__builtin_memchr;
3594     break;
3595 
3596   case Builtin::BI__builtin___memcpy_chk: {
3597     // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
3598     Expr::EvalResult SizeResult, DstSizeResult;
3599     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3600         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3601       break;
3602     llvm::APSInt Size = SizeResult.Val.getInt();
3603     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3604     if (Size.ugt(DstSize))
3605       break;
3606     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3607     Address Src = EmitPointerWithAlignment(E->getArg(1));
3608     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3609     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
3610     return RValue::get(Dest.getPointer());
3611   }
3612 
3613   case Builtin::BI__builtin_objc_memmove_collectable: {
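    // Delegate to the Objective-C runtime's GC-aware memmove.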
3614     Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
3615     Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
3616     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3617     CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
3618                                                   DestAddr, SrcAddr, SizeVal);
3619     return RValue::get(DestAddr.getPointer());
3620   }
3621 
3622   case Builtin::BI__builtin___memmove_chk: {
3623     // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
3624     Expr::EvalResult SizeResult, DstSizeResult;
3625     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3626         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3627       break;
3628     llvm::APSInt Size = SizeResult.Val.getInt();
3629     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3630     if (Size.ugt(DstSize))
3631       break;
3632     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3633     Address Src = EmitPointerWithAlignment(E->getArg(1));
3634     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3635     Builder.CreateMemMove(Dest, Src, SizeVal, false);
3636     return RValue::get(Dest.getPointer());
3637   }
3638 
3639   case Builtin::BImemmove:
3640   case Builtin::BI__builtin_memmove: {
3641     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3642     Address Src = EmitPointerWithAlignment(E->getArg(1));
3643     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3644     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3645                         E->getArg(0)->getExprLoc(), FD, 0);
3646     EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
3647                         E->getArg(1)->getExprLoc(), FD, 1);
3648     Builder.CreateMemMove(Dest, Src, SizeVal, false);
3649     return RValue::get(Dest.getPointer());
3650   }
3651   case Builtin::BImemset:
3652   case Builtin::BI__builtin_memset: {
3653     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3654     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
3655                                          Builder.getInt8Ty());
3656     Value *SizeVal = EmitScalarExpr(E->getArg(2));
3657     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3658                         E->getArg(0)->getExprLoc(), FD, 0);
3659     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
3660     return RValue::get(Dest.getPointer());
3661   }
3662   case Builtin::BI__builtin_memset_inline: {
3663     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3664     Value *ByteVal =
3665         Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
3666     uint64_t Size =
3667         E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
3668     EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
3669                         E->getArg(0)->getExprLoc(), FD, 0);
3670     Builder.CreateMemSetInline(Dest, ByteVal, Size);
3671     return RValue::get(nullptr);
3672   }
3673   case Builtin::BI__builtin___memset_chk: {
3674     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
3675     Expr::EvalResult SizeResult, DstSizeResult;
3676     if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
3677         !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
3678       break;
3679     llvm::APSInt Size = SizeResult.Val.getInt();
3680     llvm::APSInt DstSize = DstSizeResult.Val.getInt();
3681     if (Size.ugt(DstSize))
3682       break;
3683     Address Dest = EmitPointerWithAlignment(E->getArg(0));
3684     Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
3685                                          Builder.getInt8Ty());
3686     Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
3687     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
3688     return RValue::get(Dest.getPointer());
3689   }
3690   case Builtin::BI__builtin_wmemchr: {
3691     // The MSVC runtime library does not provide a definition of wmemchr, so we
3692     // need an inline implementation.
3693     if (!getTarget().getTriple().isOSMSVCRT())
3694       break;
3695 
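    // For reference, the loop emitted below is roughly equivalent to this C
    // sketch (not emitted verbatim):
    //   while (Size != 0) {
    //     if (*Str == Chr)
    //       return (wchar_t *)Str;
    //     ++Str; --Size;
    //   }
    //   return 0;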
3696     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
3697     Value *Str = EmitScalarExpr(E->getArg(0));
3698     Value *Chr = EmitScalarExpr(E->getArg(1));
3699     Value *Size = EmitScalarExpr(E->getArg(2));
3700 
3701     BasicBlock *Entry = Builder.GetInsertBlock();
3702     BasicBlock *CmpEq = createBasicBlock("wmemchr.eq");
3703     BasicBlock *Next = createBasicBlock("wmemchr.next");
3704     BasicBlock *Exit = createBasicBlock("wmemchr.exit");
3705     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
3706     Builder.CreateCondBr(SizeEq0, Exit, CmpEq);
3707 
3708     EmitBlock(CmpEq);
3709     PHINode *StrPhi = Builder.CreatePHI(Str->getType(), 2);
3710     StrPhi->addIncoming(Str, Entry);
3711     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
3712     SizePhi->addIncoming(Size, Entry);
3713     CharUnits WCharAlign =
3714         getContext().getTypeAlignInChars(getContext().WCharTy);
3715     Value *StrCh = Builder.CreateAlignedLoad(WCharTy, StrPhi, WCharAlign);
3716     Value *FoundChr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 0);
3717     Value *StrEqChr = Builder.CreateICmpEQ(StrCh, Chr);
3718     Builder.CreateCondBr(StrEqChr, Exit, Next);
3719 
3720     EmitBlock(Next);
3721     Value *NextStr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 1);
3722     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
3723     Value *NextSizeEq0 =
3724         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
3725     Builder.CreateCondBr(NextSizeEq0, Exit, CmpEq);
3726     StrPhi->addIncoming(NextStr, Next);
3727     SizePhi->addIncoming(NextSize, Next);
3728 
3729     EmitBlock(Exit);
3730     PHINode *Ret = Builder.CreatePHI(Str->getType(), 3);
3731     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Entry);
3732     Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Next);
3733     Ret->addIncoming(FoundChr, CmpEq);
3734     return RValue::get(Ret);
3735   }
3736   case Builtin::BI__builtin_wmemcmp: {
3737     // The MSVC runtime library does not provide a definition of wmemcmp, so we
3738     // need an inline implementation.
3739     if (!getTarget().getTriple().isOSMSVCRT())
3740       break;
3741 
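    // For reference, the loop emitted below is roughly equivalent to this C
    // sketch (the elements are compared as unsigned):
    //   while (Size != 0) {
    //     if (*Dst > *Src) return 1;
    //     if (*Dst < *Src) return -1;
    //     ++Dst; ++Src; --Size;
    //   }
    //   return 0;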
3742     llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
3743 
3744     Value *Dst = EmitScalarExpr(E->getArg(0));
3745     Value *Src = EmitScalarExpr(E->getArg(1));
3746     Value *Size = EmitScalarExpr(E->getArg(2));
3747 
3748     BasicBlock *Entry = Builder.GetInsertBlock();
3749     BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
3750     BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
3751     BasicBlock *Next = createBasicBlock("wmemcmp.next");
3752     BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
3753     Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
3754     Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
3755 
3756     EmitBlock(CmpGT);
3757     PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
3758     DstPhi->addIncoming(Dst, Entry);
3759     PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
3760     SrcPhi->addIncoming(Src, Entry);
3761     PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
3762     SizePhi->addIncoming(Size, Entry);
3763     CharUnits WCharAlign =
3764         getContext().getTypeAlignInChars(getContext().WCharTy);
3765     Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
3766     Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
3767     Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
3768     Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
3769 
3770     EmitBlock(CmpLT);
3771     Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
3772     Builder.CreateCondBr(DstLtSrc, Exit, Next);
3773 
3774     EmitBlock(Next);
3775     Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
3776     Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
3777     Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
3778     Value *NextSizeEq0 =
3779         Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
3780     Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
3781     DstPhi->addIncoming(NextDst, Next);
3782     SrcPhi->addIncoming(NextSrc, Next);
3783     SizePhi->addIncoming(NextSize, Next);
3784 
3785     EmitBlock(Exit);
3786     PHINode *Ret = Builder.CreatePHI(IntTy, 4);
3787     Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
3788     Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
3789     Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
3790     Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
3791     return RValue::get(Ret);
3792   }
3793   case Builtin::BI__builtin_dwarf_cfa: {
3794     // The offset in bytes from the first argument to the CFA.
3795     //
3796     // Why on earth is this in the frontend?  Is there any reason at
3797     // all that the backend can't reasonably determine this while
3798     // lowering llvm.eh.dwarf.cfa()?
3799     //
3800     // TODO: If there's a satisfactory reason, add a target hook for
3801     // this instead of hard-coding 0, which is correct for most targets.
3802     int32_t Offset = 0;
3803 
3804     Function *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
3805     return RValue::get(
3806         Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, Offset)));
3807   }
3808   case Builtin::BI__builtin_return_address: {
3809     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
3810                                                    getContext().UnsignedIntTy);
3811     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
3812     return RValue::get(Builder.CreateCall(F, Depth));
3813   }
3814   case Builtin::BI_ReturnAddress: {
3815     Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
3816     return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
3817   }
3818   case Builtin::BI__builtin_frame_address: {
3819     Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
3820                                                    getContext().UnsignedIntTy);
3821     Function *F = CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy);
3822     return RValue::get(Builder.CreateCall(F, Depth));
3823   }
3824   case Builtin::BI__builtin_extract_return_addr: {
3825     Value *Address = EmitScalarExpr(E->getArg(0));
3826     Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
3827     return RValue::get(Result);
3828   }
3829   case Builtin::BI__builtin_frob_return_addr: {
3830     Value *Address = EmitScalarExpr(E->getArg(0));
3831     Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
3832     return RValue::get(Result);
3833   }
3834   case Builtin::BI__builtin_dwarf_sp_column: {
3835     llvm::IntegerType *Ty =
3836         cast<llvm::IntegerType>(ConvertType(E->getType()));
3837     int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
3838     if (Column == -1) {
3839       CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
3840       return RValue::get(llvm::UndefValue::get(Ty));
3841     }
3842     return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
3843   }
3844   case Builtin::BI__builtin_init_dwarf_reg_size_table: {
3845     Value *Address = EmitScalarExpr(E->getArg(0));
3846     if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
3847       CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
3848     return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
3849   }
3850   case Builtin::BI__builtin_eh_return: {
3851     Value *Int = EmitScalarExpr(E->getArg(0));
3852     Value *Ptr = EmitScalarExpr(E->getArg(1));
3853 
3854     llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
3855     assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
3856            "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
3857     Function *F =
3858         CGM.getIntrinsic(IntTy->getBitWidth() == 32 ? Intrinsic::eh_return_i32
3859                                                     : Intrinsic::eh_return_i64);
3860     Builder.CreateCall(F, {Int, Ptr});
3861     Builder.CreateUnreachable();
3862 
3863     // We do need to preserve an insertion point.
3864     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
3865 
3866     return RValue::get(nullptr);
3867   }
3868   case Builtin::BI__builtin_unwind_init: {
3869     Function *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
3870     Builder.CreateCall(F);
3871     return RValue::get(nullptr);
3872   }
3873   case Builtin::BI__builtin_extend_pointer: {
3874     // Extends a pointer to the size of an _Unwind_Word, which is
3875     // uint64_t on all platforms.  Generally this gets poked into a
3876     // register and eventually used as an address, so if the
3877     // addressing registers are wider than pointers and the platform
3878     // doesn't implicitly ignore high-order bits when doing
3879     // addressing, we need to make sure we zext / sext based on
3880     // the platform's expectations.
3881     //
3882     // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
3883 
3884     // Cast the pointer to intptr_t.
3885     Value *Ptr = EmitScalarExpr(E->getArg(0));
3886     Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
3887 
3888     // If that's 64 bits, we're done.
3889     if (IntPtrTy->getBitWidth() == 64)
3890       return RValue::get(Result);
3891 
3892     // Otherwise, ask the target hooks whether to sign- or zero-extend.
3893     if (getTargetHooks().extendPointerWithSExt())
3894       return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
3895     else
3896       return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
3897   }
3898   case Builtin::BI__builtin_setjmp: {
3899     // Buffer is a void**.
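    // Only slots 0 and 2 are filled in here (the frame pointer and the stack
    // pointer); the rest of the buffer layout is left to the
    // llvm.eh.sjlj.setjmp lowering.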
3900     Address Buf = EmitPointerWithAlignment(E->getArg(0));
3901 
3902     // Store the frame pointer to the setjmp buffer.
3903     Value *FrameAddr = Builder.CreateCall(
3904         CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy),
3905         ConstantInt::get(Int32Ty, 0));
3906     Builder.CreateStore(FrameAddr, Buf);
3907 
3908     // Store the stack pointer to the setjmp buffer.
3909     Value *StackAddr =
3910         Builder.CreateCall(CGM.getIntrinsic(Intrinsic::stacksave));
3911     Address StackSaveSlot = Builder.CreateConstInBoundsGEP(Buf, 2);
3912     Builder.CreateStore(StackAddr, StackSaveSlot);
3913 
3914     // Call LLVM's EH setjmp, which is lightweight.
3915     Function *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
3916     return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
3917   }
3918   case Builtin::BI__builtin_longjmp: {
3919     Value *Buf = EmitScalarExpr(E->getArg(0));
3920     Buf = Builder.CreateBitCast(Buf, Int8PtrTy);
3921 
3922     // Call LLVM's EH longjmp, which is lightweight.
3923     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
3924 
3925     // longjmp doesn't return; mark this as unreachable.
3926     Builder.CreateUnreachable();
3927 
3928     // We do need to preserve an insertion point.
3929     EmitBlock(createBasicBlock("longjmp.cont"));
3930 
3931     return RValue::get(nullptr);
3932   }
3933   case Builtin::BI__builtin_launder: {
3934     const Expr *Arg = E->getArg(0);
3935     QualType ArgTy = Arg->getType()->getPointeeType();
3936     Value *Ptr = EmitScalarExpr(Arg);
3937     if (TypeRequiresBuiltinLaunder(CGM, ArgTy))
3938       Ptr = Builder.CreateLaunderInvariantGroup(Ptr);
3939 
3940     return RValue::get(Ptr);
3941   }
3942   case Builtin::BI__sync_fetch_and_add:
3943   case Builtin::BI__sync_fetch_and_sub:
3944   case Builtin::BI__sync_fetch_and_or:
3945   case Builtin::BI__sync_fetch_and_and:
3946   case Builtin::BI__sync_fetch_and_xor:
3947   case Builtin::BI__sync_fetch_and_nand:
3948   case Builtin::BI__sync_add_and_fetch:
3949   case Builtin::BI__sync_sub_and_fetch:
3950   case Builtin::BI__sync_and_and_fetch:
3951   case Builtin::BI__sync_or_and_fetch:
3952   case Builtin::BI__sync_xor_and_fetch:
3953   case Builtin::BI__sync_nand_and_fetch:
3954   case Builtin::BI__sync_val_compare_and_swap:
3955   case Builtin::BI__sync_bool_compare_and_swap:
3956   case Builtin::BI__sync_lock_test_and_set:
3957   case Builtin::BI__sync_lock_release:
3958   case Builtin::BI__sync_swap:
3959     llvm_unreachable("Shouldn't make it through sema");
3960   case Builtin::BI__sync_fetch_and_add_1:
3961   case Builtin::BI__sync_fetch_and_add_2:
3962   case Builtin::BI__sync_fetch_and_add_4:
3963   case Builtin::BI__sync_fetch_and_add_8:
3964   case Builtin::BI__sync_fetch_and_add_16:
3965     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
3966   case Builtin::BI__sync_fetch_and_sub_1:
3967   case Builtin::BI__sync_fetch_and_sub_2:
3968   case Builtin::BI__sync_fetch_and_sub_4:
3969   case Builtin::BI__sync_fetch_and_sub_8:
3970   case Builtin::BI__sync_fetch_and_sub_16:
3971     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
3972   case Builtin::BI__sync_fetch_and_or_1:
3973   case Builtin::BI__sync_fetch_and_or_2:
3974   case Builtin::BI__sync_fetch_and_or_4:
3975   case Builtin::BI__sync_fetch_and_or_8:
3976   case Builtin::BI__sync_fetch_and_or_16:
3977     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
3978   case Builtin::BI__sync_fetch_and_and_1:
3979   case Builtin::BI__sync_fetch_and_and_2:
3980   case Builtin::BI__sync_fetch_and_and_4:
3981   case Builtin::BI__sync_fetch_and_and_8:
3982   case Builtin::BI__sync_fetch_and_and_16:
3983     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
3984   case Builtin::BI__sync_fetch_and_xor_1:
3985   case Builtin::BI__sync_fetch_and_xor_2:
3986   case Builtin::BI__sync_fetch_and_xor_4:
3987   case Builtin::BI__sync_fetch_and_xor_8:
3988   case Builtin::BI__sync_fetch_and_xor_16:
3989     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
3990   case Builtin::BI__sync_fetch_and_nand_1:
3991   case Builtin::BI__sync_fetch_and_nand_2:
3992   case Builtin::BI__sync_fetch_and_nand_4:
3993   case Builtin::BI__sync_fetch_and_nand_8:
3994   case Builtin::BI__sync_fetch_and_nand_16:
3995     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);
3996 
3997   // Clang extensions: not overloaded yet.
3998   case Builtin::BI__sync_fetch_and_min:
3999     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
4000   case Builtin::BI__sync_fetch_and_max:
4001     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
4002   case Builtin::BI__sync_fetch_and_umin:
4003     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
4004   case Builtin::BI__sync_fetch_and_umax:
4005     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
4006 
4007   case Builtin::BI__sync_add_and_fetch_1:
4008   case Builtin::BI__sync_add_and_fetch_2:
4009   case Builtin::BI__sync_add_and_fetch_4:
4010   case Builtin::BI__sync_add_and_fetch_8:
4011   case Builtin::BI__sync_add_and_fetch_16:
4012     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
4013                                 llvm::Instruction::Add);
4014   case Builtin::BI__sync_sub_and_fetch_1:
4015   case Builtin::BI__sync_sub_and_fetch_2:
4016   case Builtin::BI__sync_sub_and_fetch_4:
4017   case Builtin::BI__sync_sub_and_fetch_8:
4018   case Builtin::BI__sync_sub_and_fetch_16:
4019     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
4020                                 llvm::Instruction::Sub);
4021   case Builtin::BI__sync_and_and_fetch_1:
4022   case Builtin::BI__sync_and_and_fetch_2:
4023   case Builtin::BI__sync_and_and_fetch_4:
4024   case Builtin::BI__sync_and_and_fetch_8:
4025   case Builtin::BI__sync_and_and_fetch_16:
4026     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
4027                                 llvm::Instruction::And);
4028   case Builtin::BI__sync_or_and_fetch_1:
4029   case Builtin::BI__sync_or_and_fetch_2:
4030   case Builtin::BI__sync_or_and_fetch_4:
4031   case Builtin::BI__sync_or_and_fetch_8:
4032   case Builtin::BI__sync_or_and_fetch_16:
4033     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
4034                                 llvm::Instruction::Or);
4035   case Builtin::BI__sync_xor_and_fetch_1:
4036   case Builtin::BI__sync_xor_and_fetch_2:
4037   case Builtin::BI__sync_xor_and_fetch_4:
4038   case Builtin::BI__sync_xor_and_fetch_8:
4039   case Builtin::BI__sync_xor_and_fetch_16:
4040     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
4041                                 llvm::Instruction::Xor);
4042   case Builtin::BI__sync_nand_and_fetch_1:
4043   case Builtin::BI__sync_nand_and_fetch_2:
4044   case Builtin::BI__sync_nand_and_fetch_4:
4045   case Builtin::BI__sync_nand_and_fetch_8:
4046   case Builtin::BI__sync_nand_and_fetch_16:
4047     return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
4048                                 llvm::Instruction::And, true);
4049 
4050   case Builtin::BI__sync_val_compare_and_swap_1:
4051   case Builtin::BI__sync_val_compare_and_swap_2:
4052   case Builtin::BI__sync_val_compare_and_swap_4:
4053   case Builtin::BI__sync_val_compare_and_swap_8:
4054   case Builtin::BI__sync_val_compare_and_swap_16:
4055     return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
4056 
4057   case Builtin::BI__sync_bool_compare_and_swap_1:
4058   case Builtin::BI__sync_bool_compare_and_swap_2:
4059   case Builtin::BI__sync_bool_compare_and_swap_4:
4060   case Builtin::BI__sync_bool_compare_and_swap_8:
4061   case Builtin::BI__sync_bool_compare_and_swap_16:
4062     return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
4063 
4064   case Builtin::BI__sync_swap_1:
4065   case Builtin::BI__sync_swap_2:
4066   case Builtin::BI__sync_swap_4:
4067   case Builtin::BI__sync_swap_8:
4068   case Builtin::BI__sync_swap_16:
4069     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4070 
4071   case Builtin::BI__sync_lock_test_and_set_1:
4072   case Builtin::BI__sync_lock_test_and_set_2:
4073   case Builtin::BI__sync_lock_test_and_set_4:
4074   case Builtin::BI__sync_lock_test_and_set_8:
4075   case Builtin::BI__sync_lock_test_and_set_16:
4076     return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4077 
4078   case Builtin::BI__sync_lock_release_1:
4079   case Builtin::BI__sync_lock_release_2:
4080   case Builtin::BI__sync_lock_release_4:
4081   case Builtin::BI__sync_lock_release_8:
4082   case Builtin::BI__sync_lock_release_16: {
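    // __sync_lock_release lowers to an atomic store of zero with release
    // ordering into the (alignment-checked) object.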
4083     Value *Ptr = CheckAtomicAlignment(*this, E);
4084     QualType ElTy = E->getArg(0)->getType()->getPointeeType();
4085     CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
4086     llvm::Type *ITy =
4087         llvm::IntegerType::get(getLLVMContext(), StoreSize.getQuantity() * 8);
4088     llvm::StoreInst *Store =
4089       Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr,
4090                                  StoreSize);
4091     Store->setAtomic(llvm::AtomicOrdering::Release);
4092     return RValue::get(nullptr);
4093   }
4094 
4095   case Builtin::BI__sync_synchronize: {
4096     // We assume this is supposed to correspond to a C++0x-style
4097     // sequentially-consistent fence (i.e. this is only usable for
4098     // synchronization, not device I/O or anything like that). This intrinsic
4099     // is really badly designed in the sense that in theory, there isn't
4100     // any way to safely use it... but in practice, it mostly works
4101     // to use it with non-atomic loads and stores to get acquire/release
4102     // semantics.
4103     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
4104     return RValue::get(nullptr);
4105   }
4106 
4107   case Builtin::BI__builtin_nontemporal_load:
4108     return RValue::get(EmitNontemporalLoad(*this, E));
4109   case Builtin::BI__builtin_nontemporal_store:
4110     return RValue::get(EmitNontemporalStore(*this, E));
4111   case Builtin::BI__c11_atomic_is_lock_free:
4112   case Builtin::BI__atomic_is_lock_free: {
4113     // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
4114     // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
4115     // _Atomic(T) is always properly-aligned.
4116     const char *LibCallName = "__atomic_is_lock_free";
4117     CallArgList Args;
4118     Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
4119              getContext().getSizeType());
4120     if (BuiltinID == Builtin::BI__atomic_is_lock_free)
4121       Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
4122                getContext().VoidPtrTy);
4123     else
4124       Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
4125                getContext().VoidPtrTy);
4126     const CGFunctionInfo &FuncInfo =
4127         CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
4128     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
4129     llvm::FunctionCallee Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
4130     return EmitCall(FuncInfo, CGCallee::forDirect(Func),
4131                     ReturnValueSlot(), Args);
4132   }
4133 
4134   case Builtin::BI__atomic_test_and_set: {
4135     // Look at the argument type before implicit conversions to decide whether
4136     // this is a volatile operation; the builtin's parameter is always volatile.
4137     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4138     bool Volatile =
4139         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4140 
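    // The operation itself is an atomic exchange of the constant 1 into the
    // byte; the boolean result is whether the previous value was nonzero.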
4141     Value *Ptr = EmitScalarExpr(E->getArg(0));
4142     Value *NewVal = Builder.getInt8(1);
4143     Value *Order = EmitScalarExpr(E->getArg(1));
4144     if (isa<llvm::ConstantInt>(Order)) {
4145       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4146       AtomicRMWInst *Result = nullptr;
4147       switch (ord) {
4148       case 0:  // memory_order_relaxed
4149       default: // invalid order
4150         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4151                                          llvm::AtomicOrdering::Monotonic);
4152         break;
4153       case 1: // memory_order_consume
4154       case 2: // memory_order_acquire
4155         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4156                                          llvm::AtomicOrdering::Acquire);
4157         break;
4158       case 3: // memory_order_release
4159         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4160                                          llvm::AtomicOrdering::Release);
4161         break;
4162       case 4: // memory_order_acq_rel
4164         Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4165                                          llvm::AtomicOrdering::AcquireRelease);
4166         break;
4167       case 5: // memory_order_seq_cst
4168         Result = Builder.CreateAtomicRMW(
4169             llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4170             llvm::AtomicOrdering::SequentiallyConsistent);
4171         break;
4172       }
4173       Result->setVolatile(Volatile);
4174       return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4175     }
4176 
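    // The ordering is not a compile-time constant: switch on it at run time,
    // emit one exchange per possible ordering, and merge the results with a
    // PHI in the continuation block.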
4177     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4178 
4179     llvm::BasicBlock *BBs[5] = {
4180       createBasicBlock("monotonic", CurFn),
4181       createBasicBlock("acquire", CurFn),
4182       createBasicBlock("release", CurFn),
4183       createBasicBlock("acqrel", CurFn),
4184       createBasicBlock("seqcst", CurFn)
4185     };
4186     llvm::AtomicOrdering Orders[5] = {
4187         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
4188         llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
4189         llvm::AtomicOrdering::SequentiallyConsistent};
4190 
4191     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4192     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4193 
4194     Builder.SetInsertPoint(ContBB);
4195     PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
4196 
4197     for (unsigned i = 0; i < 5; ++i) {
4198       Builder.SetInsertPoint(BBs[i]);
4199       AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
4200                                                    Ptr, NewVal, Orders[i]);
4201       RMW->setVolatile(Volatile);
4202       Result->addIncoming(RMW, BBs[i]);
4203       Builder.CreateBr(ContBB);
4204     }
4205 
4206     SI->addCase(Builder.getInt32(0), BBs[0]);
4207     SI->addCase(Builder.getInt32(1), BBs[1]);
4208     SI->addCase(Builder.getInt32(2), BBs[1]);
4209     SI->addCase(Builder.getInt32(3), BBs[2]);
4210     SI->addCase(Builder.getInt32(4), BBs[3]);
4211     SI->addCase(Builder.getInt32(5), BBs[4]);
4212 
4213     Builder.SetInsertPoint(ContBB);
4214     return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4215   }
4216 
4217   case Builtin::BI__atomic_clear: {
4218     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4219     bool Volatile =
4220         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4221 
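    // __atomic_clear is an atomic store of zero; only the relaxed, release,
    // and seq_cst orderings are handled below (anything else is treated as
    // relaxed).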
4222     Address Ptr = EmitPointerWithAlignment(E->getArg(0));
4223     Ptr = Ptr.withElementType(Int8Ty);
4224     Value *NewVal = Builder.getInt8(0);
4225     Value *Order = EmitScalarExpr(E->getArg(1));
4226     if (isa<llvm::ConstantInt>(Order)) {
4227       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4228       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4229       switch (ord) {
4230       case 0:  // memory_order_relaxed
4231       default: // invalid order
4232         Store->setOrdering(llvm::AtomicOrdering::Monotonic);
4233         break;
4234       case 3:  // memory_order_release
4235         Store->setOrdering(llvm::AtomicOrdering::Release);
4236         break;
4237       case 5:  // memory_order_seq_cst
4238         Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
4239         break;
4240       }
4241       return RValue::get(nullptr);
4242     }
4243 
4244     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4245 
4246     llvm::BasicBlock *BBs[3] = {
4247       createBasicBlock("monotonic", CurFn),
4248       createBasicBlock("release", CurFn),
4249       createBasicBlock("seqcst", CurFn)
4250     };
4251     llvm::AtomicOrdering Orders[3] = {
4252         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
4253         llvm::AtomicOrdering::SequentiallyConsistent};
4254 
4255     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4256     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4257 
4258     for (unsigned i = 0; i < 3; ++i) {
4259       Builder.SetInsertPoint(BBs[i]);
4260       StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4261       Store->setOrdering(Orders[i]);
4262       Builder.CreateBr(ContBB);
4263     }
4264 
4265     SI->addCase(Builder.getInt32(0), BBs[0]);
4266     SI->addCase(Builder.getInt32(3), BBs[1]);
4267     SI->addCase(Builder.getInt32(5), BBs[2]);
4268 
4269     Builder.SetInsertPoint(ContBB);
4270     return RValue::get(nullptr);
4271   }
4272 
4273   case Builtin::BI__atomic_thread_fence:
4274   case Builtin::BI__atomic_signal_fence:
4275   case Builtin::BI__c11_atomic_thread_fence:
4276   case Builtin::BI__c11_atomic_signal_fence: {
4277     llvm::SyncScope::ID SSID;
4278     if (BuiltinID == Builtin::BI__atomic_signal_fence ||
4279         BuiltinID == Builtin::BI__c11_atomic_signal_fence)
4280       SSID = llvm::SyncScope::SingleThread;
4281     else
4282       SSID = llvm::SyncScope::System;
4283     Value *Order = EmitScalarExpr(E->getArg(0));
4284     if (isa<llvm::ConstantInt>(Order)) {
4285       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4286       switch (ord) {
4287       case 0:  // memory_order_relaxed
4288       default: // invalid order
4289         break;
4290       case 1:  // memory_order_consume
4291       case 2:  // memory_order_acquire
4292         Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4293         break;
4294       case 3:  // memory_order_release
4295         Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4296         break;
4297       case 4:  // memory_order_acq_rel
4298         Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4299         break;
4300       case 5:  // memory_order_seq_cst
4301         Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4302         break;
4303       }
4304       return RValue::get(nullptr);
4305     }
4306 
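    // The ordering is not a compile-time constant: switch on it at run time
    // and emit one fence per possible ordering (relaxed and invalid values
    // emit no fence).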
4307     llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
4308     AcquireBB = createBasicBlock("acquire", CurFn);
4309     ReleaseBB = createBasicBlock("release", CurFn);
4310     AcqRelBB = createBasicBlock("acqrel", CurFn);
4311     SeqCstBB = createBasicBlock("seqcst", CurFn);
4312     llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4313 
4314     Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4315     llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
4316 
4317     Builder.SetInsertPoint(AcquireBB);
4318     Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4319     Builder.CreateBr(ContBB);
4320     SI->addCase(Builder.getInt32(1), AcquireBB);
4321     SI->addCase(Builder.getInt32(2), AcquireBB);
4322 
4323     Builder.SetInsertPoint(ReleaseBB);
4324     Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4325     Builder.CreateBr(ContBB);
4326     SI->addCase(Builder.getInt32(3), ReleaseBB);
4327 
4328     Builder.SetInsertPoint(AcqRelBB);
4329     Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4330     Builder.CreateBr(ContBB);
4331     SI->addCase(Builder.getInt32(4), AcqRelBB);
4332 
4333     Builder.SetInsertPoint(SeqCstBB);
4334     Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4335     Builder.CreateBr(ContBB);
4336     SI->addCase(Builder.getInt32(5), SeqCstBB);
4337 
4338     Builder.SetInsertPoint(ContBB);
4339     return RValue::get(nullptr);
4340   }
4341 
4342   case Builtin::BI__builtin_signbit:
4343   case Builtin::BI__builtin_signbitf:
4344   case Builtin::BI__builtin_signbitl: {
4345     return RValue::get(
4346         Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
4347                            ConvertType(E->getType())));
4348   }
4349   case Builtin::BI__warn_memset_zero_len:
4350     return RValue::getIgnored();
4351   case Builtin::BI__annotation: {
4352     // Re-encode each wide string to UTF8 and make an MDString.
4353     SmallVector<Metadata *, 1> Strings;
4354     for (const Expr *Arg : E->arguments()) {
4355       const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
4356       assert(Str->getCharByteWidth() == 2);
4357       StringRef WideBytes = Str->getBytes();
4358       std::string StrUtf8;
4359       if (!convertUTF16ToUTF8String(
4360               ArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
4361         CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
4362         continue;
4363       }
4364       Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
4365     }
4366 
4367     // Build an MDTuple of MDStrings and emit the intrinsic call.
4368     llvm::Function *F =
4369         CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
4370     MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
4371     Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
4372     return RValue::getIgnored();
4373   }
4374   case Builtin::BI__builtin_annotation: {
4375     llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
4376     llvm::Function *F =
4377         CGM.getIntrinsic(llvm::Intrinsic::annotation,
4378                          {AnnVal->getType(), CGM.ConstGlobalsPtrTy});
4379 
4380     // Get the annotation string, looking through casts. Sema requires this to be
4381     // a non-wide string literal, possibly wrapped in casts, so the cast<> is safe.
4382     const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
4383     StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
4384     return RValue::get(
4385         EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc(), nullptr));
4386   }
4387   case Builtin::BI__builtin_addcb:
4388   case Builtin::BI__builtin_addcs:
4389   case Builtin::BI__builtin_addc:
4390   case Builtin::BI__builtin_addcl:
4391   case Builtin::BI__builtin_addcll:
4392   case Builtin::BI__builtin_subcb:
4393   case Builtin::BI__builtin_subcs:
4394   case Builtin::BI__builtin_subc:
4395   case Builtin::BI__builtin_subcl:
4396   case Builtin::BI__builtin_subcll: {
4397 
4398     // We translate all of these builtins from expressions of the form:
4399     //   int x = ..., y = ..., carryin = ..., carryout, result;
4400     //   result = __builtin_addc(x, y, carryin, &carryout);
4401     //
4402     // to LLVM IR of the form:
4403     //
4404     //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
4405     //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
4406     //   %carry1 = extractvalue {i32, i1} %tmp1, 1
4407     //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
4408     //                                                       i32 %carryin)
4409     //   %result = extractvalue {i32, i1} %tmp2, 0
4410     //   %carry2 = extractvalue {i32, i1} %tmp2, 1
4411     //   %tmp3 = or i1 %carry1, %carry2
4412     //   %tmp4 = zext i1 %tmp3 to i32
4413     //   store i32 %tmp4, i32* %carryout
4414 
4415     // Scalarize our inputs.
4416     llvm::Value *X = EmitScalarExpr(E->getArg(0));
4417     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4418     llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
4419     Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));
4420 
4421     // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
4422     llvm::Intrinsic::ID IntrinsicId;
4423     switch (BuiltinID) {
4424     default: llvm_unreachable("Unknown multiprecision builtin id.");
4425     case Builtin::BI__builtin_addcb:
4426     case Builtin::BI__builtin_addcs:
4427     case Builtin::BI__builtin_addc:
4428     case Builtin::BI__builtin_addcl:
4429     case Builtin::BI__builtin_addcll:
4430       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4431       break;
4432     case Builtin::BI__builtin_subcb:
4433     case Builtin::BI__builtin_subcs:
4434     case Builtin::BI__builtin_subc:
4435     case Builtin::BI__builtin_subcl:
4436     case Builtin::BI__builtin_subcll:
4437       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4438       break;
4439     }
4440 
4441     // Construct our resulting LLVM IR expression.
4442     llvm::Value *Carry1;
4443     llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
4444                                               X, Y, Carry1);
4445     llvm::Value *Carry2;
4446     llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
4447                                               Sum1, Carryin, Carry2);
4448     llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
4449                                                X->getType());
4450     Builder.CreateStore(CarryOut, CarryOutPtr);
4451     return RValue::get(Sum2);
4452   }
4453 
4454   case Builtin::BI__builtin_add_overflow:
4455   case Builtin::BI__builtin_sub_overflow:
4456   case Builtin::BI__builtin_mul_overflow: {
4457     const clang::Expr *LeftArg = E->getArg(0);
4458     const clang::Expr *RightArg = E->getArg(1);
4459     const clang::Expr *ResultArg = E->getArg(2);
4460 
4461     clang::QualType ResultQTy =
4462         ResultArg->getType()->castAs<PointerType>()->getPointeeType();
4463 
4464     WidthAndSignedness LeftInfo =
4465         getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
4466     WidthAndSignedness RightInfo =
4467         getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
4468     WidthAndSignedness ResultInfo =
4469         getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);
4470 
4471     // Handle mixed-sign multiplication as a special case, because adding
4472     // runtime or backend support for our generic irgen would be too expensive.
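    // ("Mixed-sign" means one signed and one unsigned operand, e.g.
    // __builtin_mul_overflow with an int and an unsigned argument.)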
4473     if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
4474       return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
4475                                           RightInfo, ResultArg, ResultQTy,
4476                                           ResultInfo);
4477 
4478     if (isSpecialUnsignedMultiplySignedResult(BuiltinID, LeftInfo, RightInfo,
4479                                               ResultInfo))
4480       return EmitCheckedUnsignedMultiplySignedResult(
4481           *this, LeftArg, LeftInfo, RightArg, RightInfo, ResultArg, ResultQTy,
4482           ResultInfo);
4483 
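    // General case: widen both operands to a type that can represent every
    // value of either operand and of the result, do the checked operation
    // there, and fold any loss from truncating back down into the overflow
    // flag.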
4484     WidthAndSignedness EncompassingInfo =
4485         EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});
4486 
4487     llvm::Type *EncompassingLLVMTy =
4488         llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);
4489 
4490     llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);
4491 
4492     llvm::Intrinsic::ID IntrinsicId;
4493     switch (BuiltinID) {
4494     default:
4495       llvm_unreachable("Unknown overflow builtin id.");
4496     case Builtin::BI__builtin_add_overflow:
4497       IntrinsicId = EncompassingInfo.Signed
4498                         ? llvm::Intrinsic::sadd_with_overflow
4499                         : llvm::Intrinsic::uadd_with_overflow;
4500       break;
4501     case Builtin::BI__builtin_sub_overflow:
4502       IntrinsicId = EncompassingInfo.Signed
4503                         ? llvm::Intrinsic::ssub_with_overflow
4504                         : llvm::Intrinsic::usub_with_overflow;
4505       break;
4506     case Builtin::BI__builtin_mul_overflow:
4507       IntrinsicId = EncompassingInfo.Signed
4508                         ? llvm::Intrinsic::smul_with_overflow
4509                         : llvm::Intrinsic::umul_with_overflow;
4510       break;
4511     }
4512 
4513     llvm::Value *Left = EmitScalarExpr(LeftArg);
4514     llvm::Value *Right = EmitScalarExpr(RightArg);
4515     Address ResultPtr = EmitPointerWithAlignment(ResultArg);
4516 
4517     // Extend each operand to the encompassing type.
4518     Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
4519     Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);
4520 
4521     // Perform the operation on the extended values.
4522     llvm::Value *Overflow, *Result;
4523     Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);
4524 
4525     if (EncompassingInfo.Width > ResultInfo.Width) {
4526       // The encompassing type is wider than the result type, so we need to
4527       // truncate it.
4528       llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);
4529 
4530       // To see if the truncation caused an overflow, we will extend
4531       // the result and then compare it to the original result.
4532       llvm::Value *ResultTruncExt = Builder.CreateIntCast(
4533           ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
4534       llvm::Value *TruncationOverflow =
4535           Builder.CreateICmpNE(Result, ResultTruncExt);
4536 
4537       Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
4538       Result = ResultTrunc;
4539     }
4540 
4541     // Finally, store the result using the pointer.
4542     bool isVolatile =
4543       ResultArg->getType()->getPointeeType().isVolatileQualified();
4544     Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);
4545 
4546     return RValue::get(Overflow);
4547   }
4548 
4549   case Builtin::BI__builtin_uadd_overflow:
4550   case Builtin::BI__builtin_uaddl_overflow:
4551   case Builtin::BI__builtin_uaddll_overflow:
4552   case Builtin::BI__builtin_usub_overflow:
4553   case Builtin::BI__builtin_usubl_overflow:
4554   case Builtin::BI__builtin_usubll_overflow:
4555   case Builtin::BI__builtin_umul_overflow:
4556   case Builtin::BI__builtin_umull_overflow:
4557   case Builtin::BI__builtin_umulll_overflow:
4558   case Builtin::BI__builtin_sadd_overflow:
4559   case Builtin::BI__builtin_saddl_overflow:
4560   case Builtin::BI__builtin_saddll_overflow:
4561   case Builtin::BI__builtin_ssub_overflow:
4562   case Builtin::BI__builtin_ssubl_overflow:
4563   case Builtin::BI__builtin_ssubll_overflow:
4564   case Builtin::BI__builtin_smul_overflow:
4565   case Builtin::BI__builtin_smull_overflow:
4566   case Builtin::BI__builtin_smulll_overflow: {
4567 
4568     // We translate all of these builtins directly to the relevant LLVM intrinsic.
4569 
4570     // Scalarize our inputs.
4571     llvm::Value *X = EmitScalarExpr(E->getArg(0));
4572     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4573     Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));
4574 
4575     // Decide which of the overflow intrinsics we are lowering to:
4576     llvm::Intrinsic::ID IntrinsicId;
4577     switch (BuiltinID) {
4578     default: llvm_unreachable("Unknown overflow builtin id.");
4579     case Builtin::BI__builtin_uadd_overflow:
4580     case Builtin::BI__builtin_uaddl_overflow:
4581     case Builtin::BI__builtin_uaddll_overflow:
4582       IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4583       break;
4584     case Builtin::BI__builtin_usub_overflow:
4585     case Builtin::BI__builtin_usubl_overflow:
4586     case Builtin::BI__builtin_usubll_overflow:
4587       IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4588       break;
4589     case Builtin::BI__builtin_umul_overflow:
4590     case Builtin::BI__builtin_umull_overflow:
4591     case Builtin::BI__builtin_umulll_overflow:
4592       IntrinsicId = llvm::Intrinsic::umul_with_overflow;
4593       break;
4594     case Builtin::BI__builtin_sadd_overflow:
4595     case Builtin::BI__builtin_saddl_overflow:
4596     case Builtin::BI__builtin_saddll_overflow:
4597       IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
4598       break;
4599     case Builtin::BI__builtin_ssub_overflow:
4600     case Builtin::BI__builtin_ssubl_overflow:
4601     case Builtin::BI__builtin_ssubll_overflow:
4602       IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
4603       break;
4604     case Builtin::BI__builtin_smul_overflow:
4605     case Builtin::BI__builtin_smull_overflow:
4606     case Builtin::BI__builtin_smulll_overflow:
4607       IntrinsicId = llvm::Intrinsic::smul_with_overflow;
4608       break;
4609     }
4610 
4612     llvm::Value *Carry;
4613     llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
4614     Builder.CreateStore(Sum, SumOutPtr);
4615 
4616     return RValue::get(Carry);
4617   }
4618   case Builtin::BIaddressof:
4619   case Builtin::BI__addressof:
4620   case Builtin::BI__builtin_addressof:
4621     return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
4622   case Builtin::BI__builtin_function_start:
4623     return RValue::get(CGM.GetFunctionStart(
4624         E->getArg(0)->getAsBuiltinConstantDeclRef(CGM.getContext())));
4625   case Builtin::BI__builtin_operator_new:
4626     return EmitBuiltinNewDeleteCall(
4627         E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
4628   case Builtin::BI__builtin_operator_delete:
4629     EmitBuiltinNewDeleteCall(
4630         E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);
4631     return RValue::get(nullptr);
4632 
4633   case Builtin::BI__builtin_is_aligned:
4634     return EmitBuiltinIsAligned(E);
4635   case Builtin::BI__builtin_align_up:
4636     return EmitBuiltinAlignTo(E, true);
4637   case Builtin::BI__builtin_align_down:
4638     return EmitBuiltinAlignTo(E, false);
4639 
4640   case Builtin::BI__noop:
4641     // __noop always evaluates to an integer literal zero.
4642     return RValue::get(ConstantInt::get(IntTy, 0));
4643   case Builtin::BI__builtin_call_with_static_chain: {
4644     const CallExpr *Call = cast<CallExpr>(E->getArg(0));
4645     const Expr *Chain = E->getArg(1);
4646     return EmitCall(Call->getCallee()->getType(),
4647                     EmitCallee(Call->getCallee()), Call, ReturnValue,
4648                     EmitScalarExpr(Chain));
4649   }
4650   case Builtin::BI_InterlockedExchange8:
4651   case Builtin::BI_InterlockedExchange16:
4652   case Builtin::BI_InterlockedExchange:
4653   case Builtin::BI_InterlockedExchangePointer:
4654     return RValue::get(
4655         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
4656   case Builtin::BI_InterlockedCompareExchangePointer:
4657   case Builtin::BI_InterlockedCompareExchangePointer_nf: {
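    // Emit a pointer-width cmpxchg by round-tripping the pointer operands
    // through an integer of the same size; the _nf ("no fence") variant
    // relaxes the ordering to monotonic.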
4658     llvm::Type *RTy;
4659     llvm::IntegerType *IntType = IntegerType::get(
4660         getLLVMContext(), getContext().getTypeSize(E->getType()));
4661 
4662     llvm::Value *Destination = EmitScalarExpr(E->getArg(0));
4663 
4664     llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
4665     RTy = Exchange->getType();
4666     Exchange = Builder.CreatePtrToInt(Exchange, IntType);
4667 
4668     llvm::Value *Comparand =
4669       Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
4670 
4671     auto Ordering =
4672       BuiltinID == Builtin::BI_InterlockedCompareExchangePointer_nf ?
4673       AtomicOrdering::Monotonic : AtomicOrdering::SequentiallyConsistent;
4674 
4675     auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
4676                                               Ordering, Ordering);
4677     Result->setVolatile(true);
4678 
4679     return RValue::get(
4680         Builder.CreateIntToPtr(Builder.CreateExtractValue(Result, 0), RTy));
4682   }
4683   case Builtin::BI_InterlockedCompareExchange8:
4684   case Builtin::BI_InterlockedCompareExchange16:
4685   case Builtin::BI_InterlockedCompareExchange:
4686   case Builtin::BI_InterlockedCompareExchange64:
4687     return RValue::get(EmitAtomicCmpXchgForMSIntrin(*this, E));
4688   case Builtin::BI_InterlockedIncrement16:
4689   case Builtin::BI_InterlockedIncrement:
4690     return RValue::get(
4691         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
4692   case Builtin::BI_InterlockedDecrement16:
4693   case Builtin::BI_InterlockedDecrement:
4694     return RValue::get(
4695         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
4696   case Builtin::BI_InterlockedAnd8:
4697   case Builtin::BI_InterlockedAnd16:
4698   case Builtin::BI_InterlockedAnd:
4699     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
4700   case Builtin::BI_InterlockedExchangeAdd8:
4701   case Builtin::BI_InterlockedExchangeAdd16:
4702   case Builtin::BI_InterlockedExchangeAdd:
4703     return RValue::get(
4704         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
4705   case Builtin::BI_InterlockedExchangeSub8:
4706   case Builtin::BI_InterlockedExchangeSub16:
4707   case Builtin::BI_InterlockedExchangeSub:
4708     return RValue::get(
4709         EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
4710   case Builtin::BI_InterlockedOr8:
4711   case Builtin::BI_InterlockedOr16:
4712   case Builtin::BI_InterlockedOr:
4713     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
4714   case Builtin::BI_InterlockedXor8:
4715   case Builtin::BI_InterlockedXor16:
4716   case Builtin::BI_InterlockedXor:
4717     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
4718 
4719   case Builtin::BI_bittest64:
4720   case Builtin::BI_bittest:
4721   case Builtin::BI_bittestandcomplement64:
4722   case Builtin::BI_bittestandcomplement:
4723   case Builtin::BI_bittestandreset64:
4724   case Builtin::BI_bittestandreset:
4725   case Builtin::BI_bittestandset64:
4726   case Builtin::BI_bittestandset:
4727   case Builtin::BI_interlockedbittestandreset:
4728   case Builtin::BI_interlockedbittestandreset64:
4729   case Builtin::BI_interlockedbittestandset64:
4730   case Builtin::BI_interlockedbittestandset:
4731   case Builtin::BI_interlockedbittestandset_acq:
4732   case Builtin::BI_interlockedbittestandset_rel:
4733   case Builtin::BI_interlockedbittestandset_nf:
4734   case Builtin::BI_interlockedbittestandreset_acq:
4735   case Builtin::BI_interlockedbittestandreset_rel:
4736   case Builtin::BI_interlockedbittestandreset_nf:
4737     return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
4738 
4739     // These builtins exist to emit regular volatile loads and stores not
4740     // affected by the -fms-volatile setting.
4741   case Builtin::BI__iso_volatile_load8:
4742   case Builtin::BI__iso_volatile_load16:
4743   case Builtin::BI__iso_volatile_load32:
4744   case Builtin::BI__iso_volatile_load64:
4745     return RValue::get(EmitISOVolatileLoad(*this, E));
4746   case Builtin::BI__iso_volatile_store8:
4747   case Builtin::BI__iso_volatile_store16:
4748   case Builtin::BI__iso_volatile_store32:
4749   case Builtin::BI__iso_volatile_store64:
4750     return RValue::get(EmitISOVolatileStore(*this, E));
4751 
4752   case Builtin::BI__exception_code:
4753   case Builtin::BI_exception_code:
4754     return RValue::get(EmitSEHExceptionCode());
4755   case Builtin::BI__exception_info:
4756   case Builtin::BI_exception_info:
4757     return RValue::get(EmitSEHExceptionInfo());
4758   case Builtin::BI__abnormal_termination:
4759   case Builtin::BI_abnormal_termination:
4760     return RValue::get(EmitSEHAbnormalTermination());
4761   case Builtin::BI_setjmpex:
4762     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
4763         E->getArg(0)->getType()->isPointerType())
4764       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
4765     break;
4766   case Builtin::BI_setjmp:
4767     if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
4768         E->getArg(0)->getType()->isPointerType()) {
4769       if (getTarget().getTriple().getArch() == llvm::Triple::x86)
4770         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
4771       else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
4772         return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
4773       return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
4774     }
4775     break;
4776 
4777   // C++ std:: builtins.
4778   case Builtin::BImove:
4779   case Builtin::BImove_if_noexcept:
4780   case Builtin::BIforward:
4781   case Builtin::BIforward_like:
4782   case Builtin::BIas_const:
4783     return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
4784   case Builtin::BI__GetExceptionInfo: {
4785     if (llvm::GlobalVariable *GV =
4786             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
4787       return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
4788     break;
4789   }
4790 
4791   case Builtin::BI__fastfail:
4792     return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
4793 
4794   case Builtin::BI__builtin_coro_id:
4795     return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
4796   case Builtin::BI__builtin_coro_promise:
4797     return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
4798   case Builtin::BI__builtin_coro_resume:
4799     EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
4800     return RValue::get(nullptr);
4801   case Builtin::BI__builtin_coro_frame:
4802     return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
4803   case Builtin::BI__builtin_coro_noop:
4804     return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
4805   case Builtin::BI__builtin_coro_free:
4806     return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
4807   case Builtin::BI__builtin_coro_destroy:
4808     EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
4809     return RValue::get(nullptr);
4810   case Builtin::BI__builtin_coro_done:
4811     return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
4812   case Builtin::BI__builtin_coro_alloc:
4813     return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
4814   case Builtin::BI__builtin_coro_begin:
4815     return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
4816   case Builtin::BI__builtin_coro_end:
4817     return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
4818   case Builtin::BI__builtin_coro_suspend:
4819     return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
4820   case Builtin::BI__builtin_coro_size:
4821     return EmitCoroutineIntrinsic(E, Intrinsic::coro_size);
4822   case Builtin::BI__builtin_coro_align:
4823     return EmitCoroutineIntrinsic(E, Intrinsic::coro_align);
4824 
4825   // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
4826   case Builtin::BIread_pipe:
4827   case Builtin::BIwrite_pipe: {
4828     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4829           *Arg1 = EmitScalarExpr(E->getArg(1));
4830     CGOpenCLRuntime OpenCLRT(CGM);
4831     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4832     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4833 
4834     // Type of the generic packet parameter.
4835     unsigned GenericAS =
4836         getContext().getTargetAddressSpace(LangAS::opencl_generic);
4837     llvm::Type *I8PTy = llvm::PointerType::get(
4838         llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);
4839 
4840       // Determine which overloaded version we should generate the call for.
4841     if (2U == E->getNumArgs()) {
4842       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
4843                                                              : "__write_pipe_2";
4844       // Create a generic function type so the call works with any builtin or
4845       // user-defined type.
4846       llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
4847       llvm::FunctionType *FTy = llvm::FunctionType::get(
4848           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4849       Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
4850       return RValue::get(
4851           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4852                           {Arg0, BCast, PacketSize, PacketAlign}));
4853     } else {
4854       assert(4 == E->getNumArgs() &&
4855              "Illegal number of parameters to pipe function");
4856       const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
4857                                                              : "__write_pipe_4";
4858 
4859       llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
4860                               Int32Ty, Int32Ty};
4861       Value *Arg2 = EmitScalarExpr(E->getArg(2)),
4862             *Arg3 = EmitScalarExpr(E->getArg(3));
4863       llvm::FunctionType *FTy = llvm::FunctionType::get(
4864           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4865       Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
4866       // We know the third argument is an integer type, but we may need to cast
4867       // it to i32.
4868       if (Arg2->getType() != Int32Ty)
4869         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
4870       return RValue::get(
4871           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4872                           {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
4873     }
4874   }
4875   // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe reserve read and write
4876   // functions
4877   case Builtin::BIreserve_read_pipe:
4878   case Builtin::BIreserve_write_pipe:
4879   case Builtin::BIwork_group_reserve_read_pipe:
4880   case Builtin::BIwork_group_reserve_write_pipe:
4881   case Builtin::BIsub_group_reserve_read_pipe:
4882   case Builtin::BIsub_group_reserve_write_pipe: {
4883     // Select the runtime function name for this builtin.
4884     const char *Name;
4885     if (BuiltinID == Builtin::BIreserve_read_pipe)
4886       Name = "__reserve_read_pipe";
4887     else if (BuiltinID == Builtin::BIreserve_write_pipe)
4888       Name = "__reserve_write_pipe";
4889     else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
4890       Name = "__work_group_reserve_read_pipe";
4891     else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
4892       Name = "__work_group_reserve_write_pipe";
4893     else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
4894       Name = "__sub_group_reserve_read_pipe";
4895     else
4896       Name = "__sub_group_reserve_write_pipe";
4897 
4898     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4899           *Arg1 = EmitScalarExpr(E->getArg(1));
4900     llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
4901     CGOpenCLRuntime OpenCLRT(CGM);
4902     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4903     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4904 
4905     // Building the generic function prototype.
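    // i.e., roughly:
    //   %reserve_id __reserve_read_pipe(%pipe, i32 num_packets, i32 size, i32 align)
    // (and likewise for the other reserve variants).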
4906     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
4907     llvm::FunctionType *FTy = llvm::FunctionType::get(
4908         ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4909     // We know the second argument is an integer type, but we may need to cast
4910     // it to i32.
4911     if (Arg1->getType() != Int32Ty)
4912       Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
4913     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4914                                        {Arg0, Arg1, PacketSize, PacketAlign}));
4915   }
4916   // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
4917   // functions
4918   case Builtin::BIcommit_read_pipe:
4919   case Builtin::BIcommit_write_pipe:
4920   case Builtin::BIwork_group_commit_read_pipe:
4921   case Builtin::BIwork_group_commit_write_pipe:
4922   case Builtin::BIsub_group_commit_read_pipe:
4923   case Builtin::BIsub_group_commit_write_pipe: {
4924     const char *Name;
4925     if (BuiltinID == Builtin::BIcommit_read_pipe)
4926       Name = "__commit_read_pipe";
4927     else if (BuiltinID == Builtin::BIcommit_write_pipe)
4928       Name = "__commit_write_pipe";
4929     else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
4930       Name = "__work_group_commit_read_pipe";
4931     else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
4932       Name = "__work_group_commit_write_pipe";
4933     else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
4934       Name = "__sub_group_commit_read_pipe";
4935     else
4936       Name = "__sub_group_commit_write_pipe";
4937 
4938     Value *Arg0 = EmitScalarExpr(E->getArg(0)),
4939           *Arg1 = EmitScalarExpr(E->getArg(1));
4940     CGOpenCLRuntime OpenCLRT(CGM);
4941     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4942     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4943 
4944     // Building the generic function prototype.
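    // i.e., roughly:
    //   void __commit_read_pipe(%pipe, %reserve_id, i32 size, i32 align)
    // (and likewise for the other commit variants).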
4945     llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
4946     llvm::FunctionType *FTy =
4947         llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
4948                                 llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4949 
4950     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4951                                        {Arg0, Arg1, PacketSize, PacketAlign}));
4952   }
4953   // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
4954   case Builtin::BIget_pipe_num_packets:
4955   case Builtin::BIget_pipe_max_packets: {
4956     const char *BaseName;
4957     const auto *PipeTy = E->getArg(0)->getType()->castAs<PipeType>();
4958     if (BuiltinID == Builtin::BIget_pipe_num_packets)
4959       BaseName = "__get_pipe_num_packets";
4960     else
4961       BaseName = "__get_pipe_max_packets";
4962     std::string Name = std::string(BaseName) +
4963                        std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
4964 
4965     // Building the generic function prototype.
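    // i.e., roughly:
    //   i32 __get_pipe_num_packets_ro(%pipe, i32 size, i32 align)
    // (likewise for the _wo and __get_pipe_max_packets variants).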
4966     Value *Arg0 = EmitScalarExpr(E->getArg(0));
4967     CGOpenCLRuntime OpenCLRT(CGM);
4968     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
4969     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
4970     llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
4971     llvm::FunctionType *FTy = llvm::FunctionType::get(
4972         Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
4973 
4974     return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
4975                                        {Arg0, PacketSize, PacketAlign}));
4976   }
4977 
4978   // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
4979   case Builtin::BIto_global:
4980   case Builtin::BIto_local:
4981   case Builtin::BIto_private: {
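    // These lower to calls to __to_global/__to_local/__to_private, which take an
    // i8* in the generic address space and return an i8* in the queried address
    // space; the result is then cast back to the source-level pointer type.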
4982     auto Arg0 = EmitScalarExpr(E->getArg(0));
4983     auto NewArgT = llvm::PointerType::get(Int8Ty,
4984       CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
4985     auto NewRetT = llvm::PointerType::get(Int8Ty,
4986       CGM.getContext().getTargetAddressSpace(
4987         E->getType()->getPointeeType().getAddressSpace()));
4988     auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
4989     llvm::Value *NewArg;
4990     if (Arg0->getType()->getPointerAddressSpace() !=
4991         NewArgT->getPointerAddressSpace())
4992       NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
4993     else
4994       NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
4995     auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
4996     auto NewCall =
4997         EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
4998     return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
4999       ConvertType(E->getType())));
5000   }
5001 
5002   // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
5003   // It contains four different overload formats specified in Table 6.13.17.1.
5004   case Builtin::BIenqueue_kernel: {
5005     StringRef Name; // Generated function call name
5006     unsigned NumArgs = E->getNumArgs();
5007 
5008     llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
5009     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5010         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5011 
5012     llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
5013     llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
5014     LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
5015     llvm::Value *Range = NDRangeL.getAddress(*this).getPointer();
5016     llvm::Type *RangeTy = NDRangeL.getAddress(*this).getType();
5017 
5018     if (NumArgs == 4) {
5019       // The most basic form of the call with parameters:
5020       // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
5021       Name = "__enqueue_kernel_basic";
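      // i.e., roughly:
      //   i32 __enqueue_kernel_basic(queue_t, i32 flags, ndrange_t* byval,
      //                              i8* invoke /*generic AS*/,
      //                              i8* block /*generic AS*/)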
5022       llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
5023                               GenericVoidPtrTy};
5024       llvm::FunctionType *FTy = llvm::FunctionType::get(
5025           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5026 
5027       auto Info =
5028           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5029       llvm::Value *Kernel =
5030           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5031       llvm::Value *Block =
5032           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5033 
5034       AttrBuilder B(Builder.getContext());
5035       B.addByValAttr(NDRangeL.getAddress(*this).getElementType());
5036       llvm::AttributeList ByValAttrSet =
5037           llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
5038 
5039       auto RTCall =
5040           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
5041                           {Queue, Flags, Range, Kernel, Block});
5042       RTCall->setAttributes(ByValAttrSet);
5043       return RValue::get(RTCall);
5044     }
5045     assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
5046 
5047     // Create a temporary array to hold the sizes of local pointer arguments
5048     // for the block. \p First is the position of the first size argument.
5049     auto CreateArrayForSizeVar = [=](unsigned First)
5050         -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
5051       llvm::APInt ArraySize(32, NumArgs - First);
5052       QualType SizeArrayTy = getContext().getConstantArrayType(
5053           getContext().getSizeType(), ArraySize, nullptr, ArrayType::Normal,
5054           /*IndexTypeQuals=*/0);
5055       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
5056       llvm::Value *TmpPtr = Tmp.getPointer();
5057       llvm::Value *TmpSize = EmitLifetimeStart(
5058           CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
5059       llvm::Value *ElemPtr;
5060       // Each of the following arguments specifies the size of the corresponding
5061       // argument passed to the enqueued block.
5062       auto *Zero = llvm::ConstantInt::get(IntTy, 0);
5063       for (unsigned I = First; I < NumArgs; ++I) {
5064         auto *Index = llvm::ConstantInt::get(IntTy, I - First);
5065         auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
5066                                       {Zero, Index});
5067         if (I == First)
5068           ElemPtr = GEP;
5069         auto *V =
5070             Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
5071         Builder.CreateAlignedStore(
5072             V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
5073       }
5074       return std::tie(ElemPtr, TmpSize, TmpPtr);
5075     };
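    // TmpSize/TmpPtr are handed back so the caller can emit the matching
    // llvm.lifetime.end once the runtime call has been made.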
5076 
5077     // Could have events and/or varargs.
5078     if (E->getArg(3)->getType()->isBlockPointerType()) {
5079       // No events passed, but has variadic arguments.
5080       Name = "__enqueue_kernel_varargs";
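      // i.e., roughly:
      //   i32 __enqueue_kernel_varargs(queue_t, i32 flags, ndrange_t*, i8* invoke,
      //                                i8* block, i32 num_args, size_t* arg_sizes)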
5081       auto Info =
5082           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5083       llvm::Value *Kernel =
5084           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5085       auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5086       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5087       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4);
5088 
5089       // Create a vector of the arguments, as well as a constant value to
5090       // express to the runtime the number of variadic arguments.
5091       llvm::Value *const Args[] = {Queue,  Flags,
5092                                    Range,  Kernel,
5093                                    Block,  ConstantInt::get(IntTy, NumArgs - 4),
5094                                    ElemPtr};
5095       llvm::Type *const ArgTys[] = {
5096           QueueTy,          IntTy, RangeTy,           GenericVoidPtrTy,
5097           GenericVoidPtrTy, IntTy, ElemPtr->getType()};
5098 
5099       llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
5100       auto Call = RValue::get(
5101           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
5102       if (TmpSize)
5103         EmitLifetimeEnd(TmpSize, TmpPtr);
5104       return Call;
5105     }
5106     // All remaining forms take event arguments.
5107     if (NumArgs >= 7) {
5108       llvm::PointerType *PtrTy = llvm::PointerType::get(
5109           CGM.getLLVMContext(),
5110           CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5111 
5112       llvm::Value *NumEvents =
5113           Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
5114 
5115       // SemaOpenCLBuiltinEnqueueKernel allows the fifth and sixth arguments to
5116       // be null pointer constants (including a literal `0`), in which case we
5117       // emit a null pointer directly.
5118       llvm::Value *EventWaitList = nullptr;
5119       if (E->getArg(4)->isNullPointerConstant(
5120               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5121         EventWaitList = llvm::ConstantPointerNull::get(PtrTy);
5122       } else {
5123         EventWaitList = E->getArg(4)->getType()->isArrayType()
5124                         ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
5125                         : EmitScalarExpr(E->getArg(4));
5126         // Convert to generic address space.
5127         EventWaitList = Builder.CreatePointerCast(EventWaitList, PtrTy);
5128       }
5129       llvm::Value *EventRet = nullptr;
5130       if (E->getArg(5)->isNullPointerConstant(
5131               getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5132         EventRet = llvm::ConstantPointerNull::get(PtrTy);
5133       } else {
5134         EventRet =
5135             Builder.CreatePointerCast(EmitScalarExpr(E->getArg(5)), PtrTy);
5136       }
5137 
5138       auto Info =
5139           CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
5140       llvm::Value *Kernel =
5141           Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5142       llvm::Value *Block =
5143           Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5144 
5145       std::vector<llvm::Type *> ArgTys = {
5146           QueueTy, Int32Ty, RangeTy,          Int32Ty,
5147           PtrTy,   PtrTy,   GenericVoidPtrTy, GenericVoidPtrTy};
5148 
5149       std::vector<llvm::Value *> Args = {Queue,     Flags,         Range,
5150                                          NumEvents, EventWaitList, EventRet,
5151                                          Kernel,    Block};
5152 
5153       if (NumArgs == 7) {
5154         // Has events but no variadics.
5155         Name = "__enqueue_kernel_basic_events";
5156         llvm::FunctionType *FTy = llvm::FunctionType::get(
5157             Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5158         return RValue::get(
5159             EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5160                             llvm::ArrayRef<llvm::Value *>(Args)));
5161       }
5162       // Has event info and variadic arguments. Pass the number of variadic
5163       // arguments to the runtime function too.
5164       Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
5165       ArgTys.push_back(Int32Ty);
5166       Name = "__enqueue_kernel_events_varargs";
5167 
5168       llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5169       std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7);
5170       Args.push_back(ElemPtr);
5171       ArgTys.push_back(ElemPtr->getType());
5172 
5173       llvm::FunctionType *FTy = llvm::FunctionType::get(
5174           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5175       auto Call =
5176           RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5177                                       llvm::ArrayRef<llvm::Value *>(Args)));
5178       if (TmpSize)
5179         EmitLifetimeEnd(TmpSize, TmpPtr);
5180       return Call;
5181     }
5182     [[fallthrough]];
5183   }
5184   // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
5185   // parameter.
5186   case Builtin::BIget_kernel_work_group_size: {
5187     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5188         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5189     auto Info =
5190         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5191     Value *Kernel =
5192         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5193     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5194     return RValue::get(EmitRuntimeCall(
5195         CGM.CreateRuntimeFunction(
5196             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5197                                     false),
5198             "__get_kernel_work_group_size_impl"),
5199         {Kernel, Arg}));
5200   }
5201   case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
5202     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5203         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5204     auto Info =
5205         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5206     Value *Kernel =
5207         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5208     Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5209     return RValue::get(EmitRuntimeCall(
5210         CGM.CreateRuntimeFunction(
5211             llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5212                                     false),
5213             "__get_kernel_preferred_work_group_size_multiple_impl"),
5214         {Kernel, Arg}));
5215   }
5216   case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
5217   case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
5218     llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
5219         getContext().getTargetAddressSpace(LangAS::opencl_generic));
5220     LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
5221     llvm::Value *NDRange = NDRangeL.getAddress(*this).getPointer();
5222     auto Info =
5223         CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
5224     Value *Kernel =
5225         Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5226     Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5227     const char *Name =
5228         BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
5229             ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
5230             : "__get_kernel_sub_group_count_for_ndrange_impl";
5231     return RValue::get(EmitRuntimeCall(
5232         CGM.CreateRuntimeFunction(
5233             llvm::FunctionType::get(
5234                 IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
5235                 false),
5236             Name),
5237         {NDRange, Kernel, Block}));
5238   }
5239 
5240   case Builtin::BI__builtin_store_half:
5241   case Builtin::BI__builtin_store_halff: {
5242     Value *Val = EmitScalarExpr(E->getArg(0));
5243     Address Address = EmitPointerWithAlignment(E->getArg(1));
5244     Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
5245     Builder.CreateStore(HalfVal, Address);
5246     return RValue::get(nullptr);
5247   }
5248   case Builtin::BI__builtin_load_half: {
5249     Address Address = EmitPointerWithAlignment(E->getArg(0));
5250     Value *HalfVal = Builder.CreateLoad(Address);
5251     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
5252   }
5253   case Builtin::BI__builtin_load_halff: {
5254     Address Address = EmitPointerWithAlignment(E->getArg(0));
5255     Value *HalfVal = Builder.CreateLoad(Address);
5256     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
5257   }
5258   case Builtin::BIprintf:
5259     if (getTarget().getTriple().isNVPTX() ||
5260         getTarget().getTriple().isAMDGCN()) {
5261       if (getLangOpts().OpenMPIsTargetDevice)
5262         return EmitOpenMPDevicePrintfCallExpr(E);
5263       if (getTarget().getTriple().isNVPTX())
5264         return EmitNVPTXDevicePrintfCallExpr(E);
5265       if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
5266         return EmitAMDGPUDevicePrintfCallExpr(E);
5267     }
5268 
5269     break;
5270   case Builtin::BI__builtin_canonicalize:
5271   case Builtin::BI__builtin_canonicalizef:
5272   case Builtin::BI__builtin_canonicalizef16:
5273   case Builtin::BI__builtin_canonicalizel:
5274     return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
5275 
5276   case Builtin::BI__builtin_thread_pointer: {
5277     if (!getContext().getTargetInfo().isTLSSupported())
5278       CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
5279     // Fall through - it's already mapped to the intrinsic by ClangBuiltin.
5280     break;
5281   }
5282   case Builtin::BI__builtin_os_log_format:
5283     return emitBuiltinOSLogFormat(*E);
5284 
5285   case Builtin::BI__xray_customevent: {
5286     if (!ShouldXRayInstrumentFunction())
5287       return RValue::getIgnored();
5288 
5289     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5290             XRayInstrKind::Custom))
5291       return RValue::getIgnored();
5292 
5293     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5294       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
5295         return RValue::getIgnored();
5296 
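    // The intrinsic takes a pointer to the event payload and its length; the
    // operands are coerced below to match the intrinsic's parameter types.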
5297     Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
5298     auto FTy = F->getFunctionType();
5299     auto Arg0 = E->getArg(0);
5300     auto Arg0Val = EmitScalarExpr(Arg0);
5301     auto Arg0Ty = Arg0->getType();
5302     auto PTy0 = FTy->getParamType(0);
5303     if (PTy0 != Arg0Val->getType()) {
5304       if (Arg0Ty->isArrayType())
5305         Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
5306       else
5307         Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
5308     }
5309     auto Arg1 = EmitScalarExpr(E->getArg(1));
5310     auto PTy1 = FTy->getParamType(1);
5311     if (PTy1 != Arg1->getType())
5312       Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
5313     return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
5314   }
5315 
5316   case Builtin::BI__xray_typedevent: {
5317     // TODO: There should be a way to always emit events even if the current
5318     // function is not instrumented. Losing events in a stream can cripple
5319     // a trace.
5320     if (!ShouldXRayInstrumentFunction())
5321       return RValue::getIgnored();
5322 
5323     if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5324             XRayInstrKind::Typed))
5325       return RValue::getIgnored();
5326 
5327     if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5328       if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
5329         return RValue::getIgnored();
5330 
5331     Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
5332     auto FTy = F->getFunctionType();
5333     auto Arg0 = EmitScalarExpr(E->getArg(0));
5334     auto PTy0 = FTy->getParamType(0);
5335     if (PTy0 != Arg0->getType())
5336       Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
5337     auto Arg1 = E->getArg(1);
5338     auto Arg1Val = EmitScalarExpr(Arg1);
5339     auto Arg1Ty = Arg1->getType();
5340     auto PTy1 = FTy->getParamType(1);
5341     if (PTy1 != Arg1Val->getType()) {
5342       if (Arg1Ty->isArrayType())
5343         Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
5344       else
5345         Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
5346     }
5347     auto Arg2 = EmitScalarExpr(E->getArg(2));
5348     auto PTy2 = FTy->getParamType(2);
5349     if (PTy2 != Arg2->getType())
5350       Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
5351     return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
5352   }
5353 
5354   case Builtin::BI__builtin_ms_va_start:
5355   case Builtin::BI__builtin_ms_va_end:
5356     return RValue::get(
5357         EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
5358                        BuiltinID == Builtin::BI__builtin_ms_va_start));
5359 
5360   case Builtin::BI__builtin_ms_va_copy: {
5361     // Lower this manually. We can't reliably determine whether or not any
5362     // given va_copy() is for a Win64 va_list from the calling convention
5363     // alone, because it's legal to do this from a System V ABI function.
5364     // With opaque pointer types, we won't have enough information in LLVM
5365     // IR to determine this from the argument types, either. Best to do it
5366     // now, while we have enough information.
5367     Address DestAddr = EmitMSVAListRef(E->getArg(0));
5368     Address SrcAddr = EmitMSVAListRef(E->getArg(1));
5369 
5370     llvm::Type *BPP = Int8PtrPtrTy;
5371 
5372     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"),
5373                        Int8PtrTy, DestAddr.getAlignment());
5374     SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"),
5375                       Int8PtrTy, SrcAddr.getAlignment());
5376 
5377     Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
5378     return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
5379   }
5380 
5381   case Builtin::BI__builtin_get_device_side_mangled_name: {
5382     auto Name = CGM.getCUDARuntime().getDeviceSideName(
5383         cast<DeclRefExpr>(E->getArg(0)->IgnoreImpCasts())->getDecl());
5384     auto Str = CGM.GetAddrOfConstantCString(Name, "");
5385     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
5386                                llvm::ConstantInt::get(SizeTy, 0)};
5387     auto *Ptr = llvm::ConstantExpr::getGetElementPtr(Str.getElementType(),
5388                                                      Str.getPointer(), Zeros);
5389     return RValue::get(Ptr);
5390   }
5391   }
5392 
5393   // If this is an alias for a lib function (e.g. __builtin_sin), emit
5394   // the call using the normal call path, but using the unmangled
5395   // version of the function name.
5396   if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
5397     return emitLibraryCall(*this, FD, E,
5398                            CGM.getBuiltinLibFunction(FD, BuiltinID));
5399 
5400   // If this is a predefined lib function (e.g. malloc), emit the call
5401   // using exactly the normal call path.
5402   if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
5403     return emitLibraryCall(*this, FD, E,
5404                       cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
5405 
5406   // Check that a call to a target specific builtin has the correct target
5407   // features.
5408   // This is done down here to avoid checking non-target-specific builtins;
5409   // however, if generic builtins ever start to require generic target
5410   // features, this check can move up to the beginning of the function.
5411   checkTargetFeatures(E, FD);
5412 
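  // Track the widest vector width required by any builtin used in this
  // function; it feeds the "min-legal-vector-width" function attribute.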
5413   if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
5414     LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
5415 
5416   // See if we have a target specific intrinsic.
5417   StringRef Name = getContext().BuiltinInfo.getName(BuiltinID);
5418   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
5419   StringRef Prefix =
5420       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
5421   if (!Prefix.empty()) {
5422     IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
5423     // NOTE: we don't need to perform a compatibility flag check here since the
5424     // MS builtins are declared in Builtins*.def via LANGBUILTIN with
5425     // ALL_MS_LANGUAGES and have already been filtered out earlier.
5426     if (IntrinsicID == Intrinsic::not_intrinsic)
5427       IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
5428   }
5429 
5430   if (IntrinsicID != Intrinsic::not_intrinsic) {
5431     SmallVector<Value*, 16> Args;
5432 
5433     // Find out if any arguments are required to be integer constant
5434     // expressions.
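    // (ICEArguments is a bitmask: bit i is set when argument i must be an
    // integer constant expression.)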
5435     unsigned ICEArguments = 0;
5436     ASTContext::GetBuiltinTypeError Error;
5437     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5438     assert(Error == ASTContext::GE_None && "Should not codegen an error");
5439 
5440     Function *F = CGM.getIntrinsic(IntrinsicID);
5441     llvm::FunctionType *FTy = F->getFunctionType();
5442 
5443     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
5444       Value *ArgValue;
5445       // If this is a normal argument, just emit it as a scalar.
5446       if ((ICEArguments & (1 << i)) == 0) {
5447         ArgValue = EmitScalarExpr(E->getArg(i));
5448       } else {
5449         // If this is required to be a constant, constant fold it so that we
5450         // know that the generated intrinsic gets a ConstantInt.
5451         ArgValue = llvm::ConstantInt::get(
5452             getLLVMContext(),
5453             *E->getArg(i)->getIntegerConstantExpr(getContext()));
5454       }
5455 
5456       // If the intrinsic arg type is different from the builtin arg type,
5457       // we need to do a bitcast.
5458       llvm::Type *PTy = FTy->getParamType(i);
5459       if (PTy != ArgValue->getType()) {
5460         // XXX - vector of pointers?
5461         if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
5462           if (PtrTy->getAddressSpace() !=
5463               ArgValue->getType()->getPointerAddressSpace()) {
5464             ArgValue = Builder.CreateAddrSpaceCast(
5465                 ArgValue, llvm::PointerType::get(getLLVMContext(),
5466                                                  PtrTy->getAddressSpace()));
5467           }
5468         }
5469 
5470         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
5471                "Must be able to losslessly bit cast to param");
5472         // Cast vector types (e.g., v256i32) to x86_amx; this only happens
5473         // for AMX intrinsics.
5474         if (PTy->isX86_AMXTy())
5475           ArgValue = Builder.CreateIntrinsic(Intrinsic::x86_cast_vector_to_tile,
5476                                              {ArgValue->getType()}, {ArgValue});
5477         else
5478           ArgValue = Builder.CreateBitCast(ArgValue, PTy);
5479       }
5480 
5481       Args.push_back(ArgValue);
5482     }
5483 
5484     Value *V = Builder.CreateCall(F, Args);
5485     QualType BuiltinRetType = E->getType();
5486 
5487     llvm::Type *RetTy = VoidTy;
5488     if (!BuiltinRetType->isVoidType())
5489       RetTy = ConvertType(BuiltinRetType);
5490 
5491     if (RetTy != V->getType()) {
5492       // XXX - vector of pointers?
5493       if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
5494         if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
5495           V = Builder.CreateAddrSpaceCast(
5496               V, llvm::PointerType::get(getLLVMContext(),
5497                                         PtrTy->getAddressSpace()));
5498         }
5499       }
5500 
5501       assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
5502              "Must be able to losslessly bit cast result type");
5503       // Cast x86_amx to a vector type (e.g., v256i32); this only happens
5504       // for AMX intrinsics.
5505       if (V->getType()->isX86_AMXTy())
5506         V = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, {RetTy},
5507                                     {V});
5508       else
5509         V = Builder.CreateBitCast(V, RetTy);
5510     }
5511 
5512     if (RetTy->isVoidTy())
5513       return RValue::get(nullptr);
5514 
5515     return RValue::get(V);
5516   }
5517 
5518   // Some target-specific builtins can have aggregate return values, e.g.
5519   // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force
5520   // ReturnValue to be non-null, so that the target-specific emission code can
5521   // always just emit into it.
5522   TypeEvaluationKind EvalKind = getEvaluationKind(E->getType());
5523   if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) {
5524     Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp");
5525     ReturnValue = ReturnValueSlot(DestPtr, false);
5526   }
5527 
5528   // Now see if we can emit a target-specific builtin.
5529   if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) {
5530     switch (EvalKind) {
5531     case TEK_Scalar:
5532       if (V->getType()->isVoidTy())
5533         return RValue::get(nullptr);
5534       return RValue::get(V);
5535     case TEK_Aggregate:
5536       return RValue::getAggregate(ReturnValue.getValue(),
5537                                   ReturnValue.isVolatile());
5538     case TEK_Complex:
5539       llvm_unreachable("No current target builtin returns complex");
5540     }
5541     llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
5542   }
5543 
5544   ErrorUnsupported(E, "builtin function");
5545 
5546   // Unknown builtin: the error has been reported above; just return undef.
5547   return GetUndefRValue(E->getType());
5548 }
5549 
5550 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
5551                                         unsigned BuiltinID, const CallExpr *E,
5552                                         ReturnValueSlot ReturnValue,
5553                                         llvm::Triple::ArchType Arch) {
5554   switch (Arch) {
5555   case llvm::Triple::arm:
5556   case llvm::Triple::armeb:
5557   case llvm::Triple::thumb:
5558   case llvm::Triple::thumbeb:
5559     return CGF->EmitARMBuiltinExpr(BuiltinID, E, ReturnValue, Arch);
5560   case llvm::Triple::aarch64:
5561   case llvm::Triple::aarch64_32:
5562   case llvm::Triple::aarch64_be:
5563     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
5564   case llvm::Triple::bpfeb:
5565   case llvm::Triple::bpfel:
5566     return CGF->EmitBPFBuiltinExpr(BuiltinID, E);
5567   case llvm::Triple::x86:
5568   case llvm::Triple::x86_64:
5569     return CGF->EmitX86BuiltinExpr(BuiltinID, E);
5570   case llvm::Triple::ppc:
5571   case llvm::Triple::ppcle:
5572   case llvm::Triple::ppc64:
5573   case llvm::Triple::ppc64le:
5574     return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
5575   case llvm::Triple::r600:
5576   case llvm::Triple::amdgcn:
5577     return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
5578   case llvm::Triple::systemz:
5579     return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
5580   case llvm::Triple::nvptx:
5581   case llvm::Triple::nvptx64:
5582     return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
5583   case llvm::Triple::wasm32:
5584   case llvm::Triple::wasm64:
5585     return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
5586   case llvm::Triple::hexagon:
5587     return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
5588   case llvm::Triple::riscv32:
5589   case llvm::Triple::riscv64:
5590     return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
5591   case llvm::Triple::loongarch32:
5592   case llvm::Triple::loongarch64:
5593     return CGF->EmitLoongArchBuiltinExpr(BuiltinID, E);
5594   default:
5595     return nullptr;
5596   }
5597 }
5598 
5599 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
5600                                               const CallExpr *E,
5601                                               ReturnValueSlot ReturnValue) {
5602   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
5603     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
5604     return EmitTargetArchBuiltinExpr(
5605         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
5606         ReturnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
5607   }
5608 
5609   return EmitTargetArchBuiltinExpr(this, BuiltinID, E, ReturnValue,
5610                                    getTarget().getTriple().getArch());
5611 }
5612 
5613 static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
5614                                           NeonTypeFlags TypeFlags,
5615                                           bool HasLegalHalfType = true,
5616                                           bool V1Ty = false,
5617                                           bool AllowBFloatArgsAndRet = true) {
5618   int IsQuad = TypeFlags.isQuad();
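  // A "quad" type is a 128-bit vector, so the lane counts below double when
  // IsQuad is set (e.g. 8 x i8 for the 64-bit types vs. 16 x i8 for 128-bit).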
5619   switch (TypeFlags.getEltType()) {
5620   case NeonTypeFlags::Int8:
5621   case NeonTypeFlags::Poly8:
5622     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
5623   case NeonTypeFlags::Int16:
5624   case NeonTypeFlags::Poly16:
5625     return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5626   case NeonTypeFlags::BFloat16:
5627     if (AllowBFloatArgsAndRet)
5628       return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
5629     else
5630       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5631   case NeonTypeFlags::Float16:
5632     if (HasLegalHalfType)
5633       return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
5634     else
5635       return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
5636   case NeonTypeFlags::Int32:
5637     return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
5638   case NeonTypeFlags::Int64:
5639   case NeonTypeFlags::Poly64:
5640     return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
5641   case NeonTypeFlags::Poly128:
5642     // FIXME: i128 and f128 are not fully supported in Clang and LLVM, and a
5643     // lot of the i128/f128 API is missing, so we use v16i8 to represent
5644     // poly128 and rely on pattern matching.
5645     return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
5646   case NeonTypeFlags::Float32:
5647     return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
5648   case NeonTypeFlags::Float64:
5649     return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
5650   }
5651   llvm_unreachable("Unknown vector element type!");
5652 }
5653 
5654 static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
5655                                           NeonTypeFlags IntTypeFlags) {
5656   int IsQuad = IntTypeFlags.isQuad();
5657   switch (IntTypeFlags.getEltType()) {
5658   case NeonTypeFlags::Int16:
5659     return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
5660   case NeonTypeFlags::Int32:
5661     return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
5662   case NeonTypeFlags::Int64:
5663     return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
5664   default:
5665     llvm_unreachable("Type can't be converted to floating-point!");
5666   }
5667 }
5668 
5669 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
5670                                       const ElementCount &Count) {
5671   Value *SV = llvm::ConstantVector::getSplat(Count, C);
5672   return Builder.CreateShuffleVector(V, V, SV, "lane");
5673 }
5674 
5675 Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
5676   ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
5677   return EmitNeonSplat(V, C, EC);
5678 }
5679 
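// Coerce each operand to the type of the corresponding intrinsic parameter.
// Operand 'shift', when nonzero, is rebuilt as a splatted shift-amount
// constant (negated for right shifts); metadata parameters of constrained FP
// intrinsics are left untouched.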
5680 Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
5681                                      const char *name,
5682                                      unsigned shift, bool rightshift) {
5683   unsigned j = 0;
5684   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
5685        ai != ae; ++ai, ++j) {
5686     if (F->isConstrainedFPIntrinsic())
5687       if (ai->getType()->isMetadataTy())
5688         continue;
5689     if (shift > 0 && shift == j)
5690       Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
5691     else
5692       Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
5693   }
5694 
5695   if (F->isConstrainedFPIntrinsic())
5696     return Builder.CreateConstrainedFPCall(F, Ops, name);
5697   else
5698     return Builder.CreateCall(F, Ops, name);
5699 }
5700 
5701 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
5702                                             bool neg) {
5703   int SV = cast<ConstantInt>(V)->getSExtValue();
5704   return ConstantInt::get(Ty, neg ? -SV : SV);
5705 }
5706 
5707 // Right-shift a vector by a constant.
5708 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
5709                                           llvm::Type *Ty, bool usgn,
5710                                           const char *name) {
5711   llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
5712 
5713   int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
5714   int EltSize = VTy->getScalarSizeInBits();
5715 
5716   Vec = Builder.CreateBitCast(Vec, Ty);
5717 
5718   // lshr/ashr are undefined when the shift amount is equal to the vector
5719   // element size.
5720   if (ShiftAmt == EltSize) {
5721     if (usgn) {
5722       // Right-shifting an unsigned value by its size yields 0.
5723       return llvm::ConstantAggregateZero::get(VTy);
5724     } else {
5725       // Right-shifting a signed value by its size is equivalent
5726       // to a shift of size-1.
5727       --ShiftAmt;
5728       Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
5729     }
5730   }
5731 
5732   Shift = EmitNeonShiftVector(Shift, Ty, false);
5733   if (usgn)
5734     return Builder.CreateLShr(Vec, Shift, name);
5735   else
5736     return Builder.CreateAShr(Vec, Shift, name);
5737 }
5738 
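// Values for the TypeModifier field of the intrinsic tables below. They
// describe how the overloaded intrinsic's type list is derived from the
// builtin's return and argument types.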
5739 enum {
5740   AddRetType = (1 << 0),
5741   Add1ArgType = (1 << 1),
5742   Add2ArgTypes = (1 << 2),
5743 
5744   VectorizeRetType = (1 << 3),
5745   VectorizeArgTypes = (1 << 4),
5746 
5747   InventFloatType = (1 << 5),
5748   UnsignedAlts = (1 << 6),
5749 
5750   Use64BitVectors = (1 << 7),
5751   Use128BitVectors = (1 << 8),
5752 
5753   Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
5754   VectorRet = AddRetType | VectorizeRetType,
5755   VectorRetGetArgs01 =
5756       AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
5757   FpCmpzModifiers =
5758       AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
5759 };
5760 
5761 namespace {
5762 struct ARMVectorIntrinsicInfo {
5763   const char *NameHint;
5764   unsigned BuiltinID;
5765   unsigned LLVMIntrinsic;
5766   unsigned AltLLVMIntrinsic;
5767   uint64_t TypeModifier;
5768 
5769   bool operator<(unsigned RHSBuiltinID) const {
5770     return BuiltinID < RHSBuiltinID;
5771   }
5772   bool operator<(const ARMVectorIntrinsicInfo &TE) const {
5773     return BuiltinID < TE.BuiltinID;
5774   }
5775 };
5776 } // end anonymous namespace
5777 
5778 #define NEONMAP0(NameBase) \
5779   { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }
5780 
5781 #define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
5782   { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
5783       Intrinsic::LLVMIntrinsic, 0, TypeModifier }
5784 
5785 #define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
5786   { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
5787       Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
5788       TypeModifier }
5789 
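// Each NEONMAPn entry maps a __builtin_neon_* builtin to up to two LLVM
// intrinsics plus a TypeModifier mask; the tables below are kept sorted by
// BuiltinID so they can be binary-searched.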
5790 static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
5791   NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
5792   NEONMAP0(splat_lane_v),
5793   NEONMAP0(splat_laneq_v),
5794   NEONMAP0(splatq_lane_v),
5795   NEONMAP0(splatq_laneq_v),
5796   NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
5797   NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
5798   NEONMAP1(vabs_v, arm_neon_vabs, 0),
5799   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
5800   NEONMAP0(vadd_v),
5801   NEONMAP0(vaddhn_v),
5802   NEONMAP0(vaddq_v),
5803   NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
5804   NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
5805   NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
5806   NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
5807   NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
5808   NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
5809   NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
5810   NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
5811   NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
5812   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
5813   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
5814   NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
5815   NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
5816   NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
5817   NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
5818   NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
5819   NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
5820   NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
5821   NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
5822   NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
5823   NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
5824   NEONMAP1(vcage_v, arm_neon_vacge, 0),
5825   NEONMAP1(vcageq_v, arm_neon_vacge, 0),
5826   NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
5827   NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
5828   NEONMAP1(vcale_v, arm_neon_vacge, 0),
5829   NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
5830   NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
5831   NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
5832   NEONMAP0(vceqz_v),
5833   NEONMAP0(vceqzq_v),
5834   NEONMAP0(vcgez_v),
5835   NEONMAP0(vcgezq_v),
5836   NEONMAP0(vcgtz_v),
5837   NEONMAP0(vcgtzq_v),
5838   NEONMAP0(vclez_v),
5839   NEONMAP0(vclezq_v),
5840   NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
5841   NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
5842   NEONMAP0(vcltz_v),
5843   NEONMAP0(vcltzq_v),
5844   NEONMAP1(vclz_v, ctlz, Add1ArgType),
5845   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
5846   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
5847   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
5848   NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
5849   NEONMAP0(vcvt_f16_s16),
5850   NEONMAP0(vcvt_f16_u16),
5851   NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
5852   NEONMAP0(vcvt_f32_v),
5853   NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
5854   NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
5855   NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5856   NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
5857   NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
5858   NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
5859   NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
5860   NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
5861   NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
5862   NEONMAP0(vcvt_s16_f16),
5863   NEONMAP0(vcvt_s32_v),
5864   NEONMAP0(vcvt_s64_v),
5865   NEONMAP0(vcvt_u16_f16),
5866   NEONMAP0(vcvt_u32_v),
5867   NEONMAP0(vcvt_u64_v),
5868   NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
5869   NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
5870   NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
5871   NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
5872   NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
5873   NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
5874   NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
5875   NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
5876   NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
5877   NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
5878   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
5879   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
5880   NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
5881   NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
5882   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
5883   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
5884   NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
5885   NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
5886   NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
5887   NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
5888   NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
5889   NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
5890   NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
5891   NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
5892   NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
5893   NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
5894   NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
5895   NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
5896   NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
5897   NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
5898   NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
5899   NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
5900   NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
5901   NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
5902   NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
5903   NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
5904   NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
5905   NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
5906   NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
5907   NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
5908   NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
5909   NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
5910   NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
5911   NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
5912   NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
5913   NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
5914   NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
5915   NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
5916   NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
5917   NEONMAP0(vcvtq_f16_s16),
5918   NEONMAP0(vcvtq_f16_u16),
5919   NEONMAP0(vcvtq_f32_v),
5920   NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
5921   NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
5922   NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
5923   NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
5924   NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
5925   NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
5926   NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
5927   NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
5928   NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
5929   NEONMAP0(vcvtq_s16_f16),
5930   NEONMAP0(vcvtq_s32_v),
5931   NEONMAP0(vcvtq_s64_v),
5932   NEONMAP0(vcvtq_u16_f16),
5933   NEONMAP0(vcvtq_u32_v),
5934   NEONMAP0(vcvtq_u64_v),
5935   NEONMAP1(vdot_s32, arm_neon_sdot, 0),
5936   NEONMAP1(vdot_u32, arm_neon_udot, 0),
5937   NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
5938   NEONMAP1(vdotq_u32, arm_neon_udot, 0),
5939   NEONMAP0(vext_v),
5940   NEONMAP0(vextq_v),
5941   NEONMAP0(vfma_v),
5942   NEONMAP0(vfmaq_v),
5943   NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
5944   NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
5945   NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
5946   NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
5947   NEONMAP0(vld1_dup_v),
5948   NEONMAP1(vld1_v, arm_neon_vld1, 0),
5949   NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
5950   NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
5951   NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
5952   NEONMAP0(vld1q_dup_v),
5953   NEONMAP1(vld1q_v, arm_neon_vld1, 0),
5954   NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
5955   NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
5956   NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
5957   NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
5958   NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
5959   NEONMAP1(vld2_v, arm_neon_vld2, 0),
5960   NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
5961   NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
5962   NEONMAP1(vld2q_v, arm_neon_vld2, 0),
5963   NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
5964   NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
5965   NEONMAP1(vld3_v, arm_neon_vld3, 0),
5966   NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
5967   NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
5968   NEONMAP1(vld3q_v, arm_neon_vld3, 0),
5969   NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
5970   NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
5971   NEONMAP1(vld4_v, arm_neon_vld4, 0),
5972   NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
5973   NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
5974   NEONMAP1(vld4q_v, arm_neon_vld4, 0),
5975   NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
5976   NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
5977   NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
5978   NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
5979   NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
5980   NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
5981   NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
5982   NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
5983   NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
5984   NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
5985   NEONMAP0(vmovl_v),
5986   NEONMAP0(vmovn_v),
5987   NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
5988   NEONMAP0(vmull_v),
5989   NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
5990   NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
5991   NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
5992   NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
5993   NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
5994   NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
5995   NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
5996   NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
5997   NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
5998   NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
5999   NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
6000   NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6001   NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6002   NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
6003   NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
6004   NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
6005   NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
6006   NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
6007   NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
6008   NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
6009   NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
6010   NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
6011   NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
6012   NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
6013   NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
6014   NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
6015   NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
6016   NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
6017   NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
6018   NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
6019   NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
6020   NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
6021   NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6022   NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6023   NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6024   NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6025   NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6026   NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6027   NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
6028   NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
6029   NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6030   NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6031   NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
6032   NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6033   NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6034   NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
6035   NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
6036   NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6037   NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6038   NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
6039   NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
6040   NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
6041   NEONMAP0(vrndi_v),
6042   NEONMAP0(vrndiq_v),
6043   NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
6044   NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
6045   NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
6046   NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
6047   NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
6048   NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
6049   NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
6050   NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
6051   NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
6052   NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6053   NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6054   NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6055   NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6056   NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6057   NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6058   NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
6059   NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
6060   NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
6061   NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
6062   NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
6063   NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
6064   NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
6065   NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
6066   NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
6067   NEONMAP0(vshl_n_v),
6068   NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6069   NEONMAP0(vshll_n_v),
6070   NEONMAP0(vshlq_n_v),
6071   NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6072   NEONMAP0(vshr_n_v),
6073   NEONMAP0(vshrn_n_v),
6074   NEONMAP0(vshrq_n_v),
6075   NEONMAP1(vst1_v, arm_neon_vst1, 0),
6076   NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
6077   NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
6078   NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
6079   NEONMAP1(vst1q_v, arm_neon_vst1, 0),
6080   NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
6081   NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
6082   NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
6083   NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
6084   NEONMAP1(vst2_v, arm_neon_vst2, 0),
6085   NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
6086   NEONMAP1(vst2q_v, arm_neon_vst2, 0),
6087   NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
6088   NEONMAP1(vst3_v, arm_neon_vst3, 0),
6089   NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
6090   NEONMAP1(vst3q_v, arm_neon_vst3, 0),
6091   NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
6092   NEONMAP1(vst4_v, arm_neon_vst4, 0),
6093   NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
6094   NEONMAP1(vst4q_v, arm_neon_vst4, 0),
6095   NEONMAP0(vsubhn_v),
6096   NEONMAP0(vtrn_v),
6097   NEONMAP0(vtrnq_v),
6098   NEONMAP0(vtst_v),
6099   NEONMAP0(vtstq_v),
6100   NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
6101   NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
6102   NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
6103   NEONMAP0(vuzp_v),
6104   NEONMAP0(vuzpq_v),
6105   NEONMAP0(vzip_v),
6106   NEONMAP0(vzipq_v)
6107 };
6108 
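// AArch64 Advanced SIMD (vector) intrinsic map. As with the ARM map above,
// entries must stay sorted by builtin ID for the binary search in
// findARMVectorIntrinsicInMap.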
6109 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
6110   NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
6111   NEONMAP0(splat_lane_v),
6112   NEONMAP0(splat_laneq_v),
6113   NEONMAP0(splatq_lane_v),
6114   NEONMAP0(splatq_laneq_v),
6115   NEONMAP1(vabs_v, aarch64_neon_abs, 0),
6116   NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
6117   NEONMAP0(vadd_v),
6118   NEONMAP0(vaddhn_v),
6119   NEONMAP0(vaddq_p128),
6120   NEONMAP0(vaddq_v),
6121   NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
6122   NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
6123   NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
6124   NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
6125   NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6126   NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6127   NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6128   NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6129   NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6130   NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6131   NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6132   NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6133   NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
6134   NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
6135   NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
6136   NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
6137   NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
6138   NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6139   NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6140   NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6141   NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6142   NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6143   NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6144   NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
6145   NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6146   NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6147   NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
6148   NEONMAP1(vcage_v, aarch64_neon_facge, 0),
6149   NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
6150   NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
6151   NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
6152   NEONMAP1(vcale_v, aarch64_neon_facge, 0),
6153   NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
6154   NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
6155   NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
6156   NEONMAP0(vceqz_v),
6157   NEONMAP0(vceqzq_v),
6158   NEONMAP0(vcgez_v),
6159   NEONMAP0(vcgezq_v),
6160   NEONMAP0(vcgtz_v),
6161   NEONMAP0(vcgtzq_v),
6162   NEONMAP0(vclez_v),
6163   NEONMAP0(vclezq_v),
6164   NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
6165   NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
6166   NEONMAP0(vcltz_v),
6167   NEONMAP0(vcltzq_v),
6168   NEONMAP1(vclz_v, ctlz, Add1ArgType),
6169   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6170   NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6171   NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6172   NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6173   NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6174   NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6175   NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6176   NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6177   NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6178   NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6179   NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6180   NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
6181   NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6182   NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6183   NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
6184   NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6185   NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6186   NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
6187   NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6188   NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6189   NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
6190   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6191   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6192   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
6193   NEONMAP0(vcvt_f16_s16),
6194   NEONMAP0(vcvt_f16_u16),
6195   NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
6196   NEONMAP0(vcvt_f32_v),
6197   NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6198   NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6199   NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6200   NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6201   NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6202   NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6203   NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6204   NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6205   NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6206   NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6207   NEONMAP0(vcvtq_f16_s16),
6208   NEONMAP0(vcvtq_f16_u16),
6209   NEONMAP0(vcvtq_f32_v),
6210   NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
6211   NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6212   NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6213   NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6214   NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6215   NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6216   NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6217   NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6218   NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6219   NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6220   NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6221   NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
6222   NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
6223   NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
6224   NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
6225   NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
6226   NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6227   NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6228   NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6229   NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6230   NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6231   NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6232   NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6233   NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6234   NEONMAP0(vext_v),
6235   NEONMAP0(vextq_v),
6236   NEONMAP0(vfma_v),
6237   NEONMAP0(vfmaq_v),
6238   NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
6239   NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
6240   NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
6241   NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
6242   NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
6243   NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
6244   NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
6245   NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
6246   NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6247   NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6248   NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6249   NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6250   NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
6251   NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
6252   NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
6253   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
6254   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
6255   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
6256   NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
6257   NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
6258   NEONMAP0(vmovl_v),
6259   NEONMAP0(vmovn_v),
6260   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
6261   NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
6262   NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
6263   NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6264   NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6265   NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
6266   NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
6267   NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
6268   NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6269   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6270   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
6271   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
6272   NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
6273   NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6274   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
6275   NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
6276   NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6277   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
6278   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
6279   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
6280   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
6281   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
6282   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
6283   NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6284   NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6285   NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6286   NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6287   NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6288   NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6289   NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6290   NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6291   NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6292   NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6293   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
6294   NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6295   NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6296   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
6297   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6298   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6299   NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6300   NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6301   NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6302   NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6303   NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
6304   NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
6305   NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6306   NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6307   NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
6308   NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
6309   NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6310   NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6311   NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
6312   NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
6313   NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6314   NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6315   NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
6316   NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
6317   NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
6318   NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
6319   NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
6320   NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
6321   NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
6322   NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
6323   NEONMAP0(vrndi_v),
6324   NEONMAP0(vrndiq_v),
6325   NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6326   NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6327   NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6328   NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6329   NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6330   NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6331   NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
6332   NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
6333   NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
6334   NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
6335   NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
6336   NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
6337   NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
6338   NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
6339   NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
6340   NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
6341   NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
6342   NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
6343   NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
6344   NEONMAP0(vshl_n_v),
6345   NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6346   NEONMAP0(vshll_n_v),
6347   NEONMAP0(vshlq_n_v),
6348   NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6349   NEONMAP0(vshr_n_v),
6350   NEONMAP0(vshrn_n_v),
6351   NEONMAP0(vshrq_n_v),
6352   NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
6353   NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
6354   NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
6355   NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
6356   NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
6357   NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
6358   NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
6359   NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
6360   NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
6361   NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
6362   NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
6363   NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
6364   NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
6365   NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
6366   NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
6367   NEONMAP0(vsubhn_v),
6368   NEONMAP0(vtst_v),
6369   NEONMAP0(vtstq_v),
6370   NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
6371   NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
6372   NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
6373   NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
6374 };
6375 
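// Scalar (SISD) intrinsic map; builtins found here are emitted through
// EmitCommonNeonSISDBuiltinExpr below.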
6376 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
6377   NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
6378   NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
6379   NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
6380   NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6381   NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6382   NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6383   NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6384   NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6385   NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6386   NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6387   NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6388   NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
6389   NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6390   NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
6391   NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6392   NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6393   NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6394   NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6395   NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6396   NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6397   NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6398   NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6399   NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6400   NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6401   NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6402   NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6403   NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6404   NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6405   NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6406   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6407   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6408   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6409   NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6410   NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6411   NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
6412   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6413   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6414   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6415   NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6416   NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6417   NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6418   NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6419   NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6420   NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6421   NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6422   NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6423   NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6424   NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6425   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6426   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6427   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6428   NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6429   NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6430   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
6431   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6432   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6433   NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6434   NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6435   NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6436   NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6437   NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6438   NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6439   NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6440   NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6441   NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6442   NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6443   NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6444   NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6445   NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6446   NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6447   NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6448   NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6449   NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6450   NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6451   NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
6452   NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
6453   NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
6454   NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6455   NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6456   NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6457   NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6458   NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6459   NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6460   NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6461   NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6462   NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6463   NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6464   NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6465   NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
6466   NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6467   NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
6468   NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6469   NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6470   NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
6471   NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
6472   NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6473   NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6474   NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
6475   NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
6476   NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
6477   NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
6478   NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
6479   NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
6480   NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
6481   NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
6482   NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6483   NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6484   NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6485   NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6486   NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
6487   NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6488   NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6489   NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6490   NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
6491   NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6492   NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
6493   NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
6494   NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6495   NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
6496   NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6497   NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
6498   NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
6499   NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6500   NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6501   NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
6502   NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
6503   NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6504   NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6505   NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
6506   NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
6507   NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
6508   NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
6509   NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6510   NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6511   NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6512   NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6513   NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
6514   NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6515   NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6516   NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6517   NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6518   NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6519   NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6520   NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
6521   NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
6522   NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6523   NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6524   NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6525   NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6526   NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
6527   NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
6528   NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
6529   NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
6530   NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6531   NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6532   NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
6533   NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
6534   NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
6535   NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6536   NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
6537   NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6538   NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
6539   NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
6540   NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
6541   NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
6542   NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
6543   NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
6544   NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
6545   NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
6546   NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
6547   NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
6548   NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
6549   NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
6550   NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
6551   NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
6552   NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
6553   NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
6554   NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
6555   NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
6556   NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
6557   NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
6558   NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
6559   NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
6560   NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
6561   NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
6562   NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
6563   NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
6564   NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
6565   NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
6566   NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
6567   NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
6568   NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
6569   NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
6570   NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
6571   NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
6572   NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
6573   NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
6574   NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
6575   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
6576   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
6577   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
6578   // FP16 scalar intrinsics go here.
6579   NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
6580   NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6581   NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6582   NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6583   NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6584   NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6585   NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6586   NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6587   NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6588   NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6589   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6590   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6591   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6592   NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6593   NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6594   NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6595   NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6596   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6597   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6598   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6599   NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6600   NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6601   NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6602   NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6603   NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6604   NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6605   NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6606   NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6607   NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6608   NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
6609   NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
6610   NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
6611   NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
6612   NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
6613 };
6614 
6615 // Some intrinsics are equivalent for codegen.
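// Each pair maps the builtin on the left onto the builtin on the right, whose
// ordinary codegen path is reused.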
6616 static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
6617   { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
6618   { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
6619   { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
6620   { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
6621   { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
6622   { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
6623   { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
6624   { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
6625   { NEON::BI__builtin_neon_vbsl_f16, NEON::BI__builtin_neon_vbsl_v, },
6626   { NEON::BI__builtin_neon_vbslq_f16, NEON::BI__builtin_neon_vbslq_v, },
6627   { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
6628   { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
6629   { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
6630   { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
6631   { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
6632   { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
6633   { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
6634   { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
6635   { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
6636   { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
6637   { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
6638   { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
6639   { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
6640   { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
6641   { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
6642   { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
6643   { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
6644   { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
6645   { NEON::BI__builtin_neon_vext_f16, NEON::BI__builtin_neon_vext_v, },
6646   { NEON::BI__builtin_neon_vextq_f16, NEON::BI__builtin_neon_vextq_v, },
6647   { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
6648   { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
6649   { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
6650   { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
6651   { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
6652   { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
6653   { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
6654   { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
6655   { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
6656   { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
6657   { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
6658   { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
6659   { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
6660   { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
6661   { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
6662   { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
6663   { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
6664   { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
6665   { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
6666   { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
6667   { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
6668   { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
6669   { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
6670   { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
6671   { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
6672   { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
6673   { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
6674   { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
6675   { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
6676   { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
6677   { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
6678   { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
6679   { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
6680   { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
6681   { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
6682   { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
6683   { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
6684   { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
6685   { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
6686   { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
6687   { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
6688   { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
6689   { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
6690   { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
6691   { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
6692   { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
6693   { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
6694   { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
6695   { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
6696   { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
6697   { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
6698   { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
6699   { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
6700   { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
6701   { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
6702   { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
6703   { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
6704   { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
6705   { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
6706   { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
6707   { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
6708   { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
6709   { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
6710   { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
6711   { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
6712   { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
6713   { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
6714   { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
6715   { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
6716   { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
6717   { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
6718   { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
6719   { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
6720   { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
6721   { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
6722   { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
6723   { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
6724   { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
6725   { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
6726   { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
6727   { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
6728   { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
6729   { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
6730   { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
6731   { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
6732   { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
6733   { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
6734   { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
6735   { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
6736   { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
6737   { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
6738   { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
6739   { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
6740   { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
6741   { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
6742   { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
6743   { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
6744   { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
6745   { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
6746   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
6747   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
6748   { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
6749   { NEON::BI__builtin_neon_vtrn_f16, NEON::BI__builtin_neon_vtrn_v, },
6750   { NEON::BI__builtin_neon_vtrnq_f16, NEON::BI__builtin_neon_vtrnq_v, },
6751   { NEON::BI__builtin_neon_vuzp_f16, NEON::BI__builtin_neon_vuzp_v, },
6752   { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, },
6753   { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, },
6754   { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, },
6755   // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
6756   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
6757   // arbitrary one to be handled as the canonical variation.
6758   { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
6759   { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
6760   { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
6761   { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
6762   { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
6763   { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
6764   { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
6765   { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
6766   { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
6767   { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
6768   { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
6769   { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
6770 };
6771 
6772 #undef NEONMAP0
6773 #undef NEONMAP1
6774 #undef NEONMAP2
6775 
6776 #define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
6777   {                                                                            \
6778     #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
6779         TypeModifier                                                           \
6780   }
6781 
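// SVEMAP2 entries carry no LLVM intrinsic ID; such builtins are expanded by
// custom codegen rather than a direct intrinsic call.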
6782 #define SVEMAP2(NameBase, TypeModifier)                                        \
6783   { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
6784 static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
6785 #define GET_SVE_LLVM_INTRINSIC_MAP
6786 #include "clang/Basic/arm_sve_builtin_cg.inc"
6787 #include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
6788 #undef GET_SVE_LLVM_INTRINSIC_MAP
6789 };
6790 
6791 #undef SVEMAP1
6792 #undef SVEMAP2
6793 
6794 #define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
6795   {                                                                            \
6796     #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
6797         TypeModifier                                                           \
6798   }
6799 
6800 #define SMEMAP2(NameBase, TypeModifier)                                        \
6801   { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
6802 static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
6803 #define GET_SME_LLVM_INTRINSIC_MAP
6804 #include "clang/Basic/arm_sme_builtin_cg.inc"
6805 #undef GET_SME_LLVM_INTRINSIC_MAP
6806 };
6807 
6808 #undef SMEMAP1
6809 #undef SMEMAP2
6810 
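// Debug-only flags recording that a map has already been checked to be
// sorted, so the assertion in findARMVectorIntrinsicInMap runs at most once
// per map.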
6811 static bool NEONSIMDIntrinsicsProvenSorted = false;
6812 
6813 static bool AArch64SIMDIntrinsicsProvenSorted = false;
6814 static bool AArch64SISDIntrinsicsProvenSorted = false;
6815 static bool AArch64SVEIntrinsicsProvenSorted = false;
6816 static bool AArch64SMEIntrinsicsProvenSorted = false;
6817 
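// Binary-search IntrinsicMap for BuiltinID, checking once per map (in builds
// with assertions) that the table really is sorted. Returns the matching
// entry, or null if the builtin has no entry in the map.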
6818 static const ARMVectorIntrinsicInfo *
6819 findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
6820                             unsigned BuiltinID, bool &MapProvenSorted) {
6821 
6822 #ifndef NDEBUG
6823   if (!MapProvenSorted) {
6824     assert(llvm::is_sorted(IntrinsicMap));
6825     MapProvenSorted = true;
6826   }
6827 #endif
6828 
6829   const ARMVectorIntrinsicInfo *Builtin =
6830       llvm::lower_bound(IntrinsicMap, BuiltinID);
6831 
6832   if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
6833     return Builtin;
6834 
6835   return nullptr;
6836 }
6837 
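// Build the overloaded type list implied by Modifier (optionally the return
// type and one or two copies of the argument type, promoted to 64- or 128-bit
// vectors where requested) and fetch the matching intrinsic declaration.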
6838 Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
6839                                                    unsigned Modifier,
6840                                                    llvm::Type *ArgType,
6841                                                    const CallExpr *E) {
6842   int VectorSize = 0;
6843   if (Modifier & Use64BitVectors)
6844     VectorSize = 64;
6845   else if (Modifier & Use128BitVectors)
6846     VectorSize = 128;
6847 
6848   // Return type.
6849   SmallVector<llvm::Type *, 3> Tys;
6850   if (Modifier & AddRetType) {
6851     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
6852     if (Modifier & VectorizeRetType)
6853       Ty = llvm::FixedVectorType::get(
6854           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
6855 
6856     Tys.push_back(Ty);
6857   }
6858 
6859   // Arguments.
6860   if (Modifier & VectorizeArgTypes) {
6861     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
6862     ArgType = llvm::FixedVectorType::get(ArgType, Elts);
6863   }
6864 
6865   if (Modifier & (Add1ArgType | Add2ArgTypes))
6866     Tys.push_back(ArgType);
6867 
6868   if (Modifier & Add2ArgTypes)
6869     Tys.push_back(ArgType);
6870 
6871   if (Modifier & InventFloatType)
6872     Tys.push_back(FloatTy);
6873 
6874   return CGM.getIntrinsic(IntrinsicID, Tys);
6875 }
6876 
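// Emit a call to a scalar (SISD) NEON builtin. Where the intrinsic expects a
// vector argument, the scalar operand is inserted into lane 0, and a scalar
// result is extracted back out of the returned vector when needed.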
6877 static Value *EmitCommonNeonSISDBuiltinExpr(
6878     CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
6879     SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
6880   unsigned BuiltinID = SISDInfo.BuiltinID;
6881   unsigned Int = SISDInfo.LLVMIntrinsic;
6882   unsigned Modifier = SISDInfo.TypeModifier;
6883   const char *s = SISDInfo.NameHint;
6884 
6885   switch (BuiltinID) {
6886   case NEON::BI__builtin_neon_vcled_s64:
6887   case NEON::BI__builtin_neon_vcled_u64:
6888   case NEON::BI__builtin_neon_vcles_f32:
6889   case NEON::BI__builtin_neon_vcled_f64:
6890   case NEON::BI__builtin_neon_vcltd_s64:
6891   case NEON::BI__builtin_neon_vcltd_u64:
6892   case NEON::BI__builtin_neon_vclts_f32:
6893   case NEON::BI__builtin_neon_vcltd_f64:
6894   case NEON::BI__builtin_neon_vcales_f32:
6895   case NEON::BI__builtin_neon_vcaled_f64:
6896   case NEON::BI__builtin_neon_vcalts_f32:
6897   case NEON::BI__builtin_neon_vcaltd_f64:
6898     // Only one direction of comparison actually exists; cmle is actually a
6899     // cmge with swapped operands. The table gives us the right intrinsic but
6900     // we still need to do the swap.
6901     std::swap(Ops[0], Ops[1]);
6902     break;
6903   }
6904 
6905   assert(Int && "Generic code assumes a valid intrinsic");
6906 
6907   // Determine the type(s) of this overloaded AArch64 intrinsic.
6908   const Expr *Arg = E->getArg(0);
6909   llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
6910   Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
6911 
6912   int j = 0;
6913   ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
6914   for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
6915        ai != ae; ++ai, ++j) {
6916     llvm::Type *ArgTy = ai->getType();
6917     if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
6918              ArgTy->getPrimitiveSizeInBits())
6919       continue;
6920 
6921     assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
6922     // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
6923     // it before inserting.
6924     Ops[j] = CGF.Builder.CreateTruncOrBitCast(
6925         Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
6926     Ops[j] =
6927         CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
6928   }
6929 
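  // Emit the call; if the intrinsic returns a wider vector than the builtin's
  // result type, the scalar answer lives in lane 0.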
6930   Value *Result = CGF.EmitNeonCall(F, Ops, s);
6931   llvm::Type *ResultType = CGF.ConvertType(E->getType());
6932   if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
6933       Result->getType()->getPrimitiveSizeInBits().getFixedValue())
6934     return CGF.Builder.CreateExtractElement(Result, C0);
6935 
6936   return CGF.Builder.CreateBitCast(Result, ResultType, s);
6937 }
6938 
6939 Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
6940     unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
6941     const char *NameHint, unsigned Modifier, const CallExpr *E,
6942     SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
6943     llvm::Triple::ArchType Arch) {
6944   // Get the last argument, which specifies the vector type.
6945   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
6946   std::optional<llvm::APSInt> NeonTypeConst =
6947       Arg->getIntegerConstantExpr(getContext());
6948   if (!NeonTypeConst)
6949     return nullptr;
6950 
6951   // Determine the type of this overloaded NEON intrinsic.
6952   NeonTypeFlags Type(NeonTypeConst->getZExtValue());
6953   bool Usgn = Type.isUnsigned();
6954   bool Quad = Type.isQuad();
6955   const bool HasLegalHalfType = getTarget().hasLegalHalfType();
6956   const bool AllowBFloatArgsAndRet =
6957       getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
6958 
6959   llvm::FixedVectorType *VTy =
6960       GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet);
6961   llvm::Type *Ty = VTy;
6962   if (!Ty)
6963     return nullptr;
6964 
6965   auto getAlignmentValue32 = [&](Address addr) -> Value* {
6966     return Builder.getInt32(addr.getAlignment().getQuantity());
6967   };
6968 
6969   unsigned Int = LLVMIntrinsic;
6970   if ((Modifier & UnsignedAlts) && !Usgn)
6971     Int = AltLLVMIntrinsic;
6972 
6973   switch (BuiltinID) {
6974   default: break;
6975   case NEON::BI__builtin_neon_splat_lane_v:
6976   case NEON::BI__builtin_neon_splat_laneq_v:
6977   case NEON::BI__builtin_neon_splatq_lane_v:
6978   case NEON::BI__builtin_neon_splatq_laneq_v: {
6979     auto NumElements = VTy->getElementCount();
6980     if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
6981       NumElements = NumElements * 2;
6982     if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
6983       NumElements = NumElements.divideCoefficientBy(2);
6984 
6985     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
6986     return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
6987   }
6988   case NEON::BI__builtin_neon_vpadd_v:
6989   case NEON::BI__builtin_neon_vpaddq_v:
6990     // We don't allow fp/int overloading of intrinsics.
6991     if (VTy->getElementType()->isFloatingPointTy() &&
6992         Int == Intrinsic::aarch64_neon_addp)
6993       Int = Intrinsic::aarch64_neon_faddp;
6994     break;
6995   case NEON::BI__builtin_neon_vabs_v:
6996   case NEON::BI__builtin_neon_vabsq_v:
6997     if (VTy->getElementType()->isFloatingPointTy())
6998       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
6999     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
7000   case NEON::BI__builtin_neon_vadd_v:
7001   case NEON::BI__builtin_neon_vaddq_v: {
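    // Note: the polynomial vadd variants (e.g. vadd_p8) land on this builtin;
    // polynomial addition over GF(2) is a bitwise XOR, which is why an xor on
    // a byte vector of the same width is emitted below.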
7002     llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
7003     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7004     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
7006     return Builder.CreateBitCast(Ops[0], Ty);
7007   }
7008   case NEON::BI__builtin_neon_vaddhn_v: {
7009     llvm::FixedVectorType *SrcTy =
7010         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7011 
7012     // %sum = add <4 x i32> %lhs, %rhs
7013     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7014     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7015     Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
7016 
7017     // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
7018     Constant *ShiftAmt =
7019         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7020     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
7021 
7022     // %res = trunc <4 x i32> %high to <4 x i16>
7023     return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
7024   }
7025   case NEON::BI__builtin_neon_vcale_v:
7026   case NEON::BI__builtin_neon_vcaleq_v:
7027   case NEON::BI__builtin_neon_vcalt_v:
7028   case NEON::BI__builtin_neon_vcaltq_v:
7029     std::swap(Ops[0], Ops[1]);
7030     [[fallthrough]];
7031   case NEON::BI__builtin_neon_vcage_v:
7032   case NEON::BI__builtin_neon_vcageq_v:
7033   case NEON::BI__builtin_neon_vcagt_v:
7034   case NEON::BI__builtin_neon_vcagtq_v: {
7035     llvm::Type *Ty;
7036     switch (VTy->getScalarSizeInBits()) {
7037     default: llvm_unreachable("unexpected type");
7038     case 32:
7039       Ty = FloatTy;
7040       break;
7041     case 64:
7042       Ty = DoubleTy;
7043       break;
7044     case 16:
7045       Ty = HalfTy;
7046       break;
7047     }
7048     auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
7049     llvm::Type *Tys[] = { VTy, VecFlt };
7050     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7051     return EmitNeonCall(F, Ops, NameHint);
7052   }
7053   case NEON::BI__builtin_neon_vceqz_v:
7054   case NEON::BI__builtin_neon_vceqzq_v:
7055     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
7056                                          ICmpInst::ICMP_EQ, "vceqz");
7057   case NEON::BI__builtin_neon_vcgez_v:
7058   case NEON::BI__builtin_neon_vcgezq_v:
7059     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
7060                                          ICmpInst::ICMP_SGE, "vcgez");
7061   case NEON::BI__builtin_neon_vclez_v:
7062   case NEON::BI__builtin_neon_vclezq_v:
7063     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
7064                                          ICmpInst::ICMP_SLE, "vclez");
7065   case NEON::BI__builtin_neon_vcgtz_v:
7066   case NEON::BI__builtin_neon_vcgtzq_v:
7067     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
7068                                          ICmpInst::ICMP_SGT, "vcgtz");
7069   case NEON::BI__builtin_neon_vcltz_v:
7070   case NEON::BI__builtin_neon_vcltzq_v:
7071     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
7072                                          ICmpInst::ICMP_SLT, "vcltz");
7073   case NEON::BI__builtin_neon_vclz_v:
7074   case NEON::BI__builtin_neon_vclzq_v:
    // We generate a target-independent intrinsic, which needs a second argument
    // indicating whether or not clz of zero is undefined; on ARM it isn't.
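    // e.g. on ARM, vclz_s32 lowers to llvm.ctlz with a 'false' flag, since CLZ
    // of zero is well-defined there (it yields 32).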
7077     Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
7078     break;
7079   case NEON::BI__builtin_neon_vcvt_f32_v:
7080   case NEON::BI__builtin_neon_vcvtq_f32_v:
7081     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7082     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
7083                      HasLegalHalfType);
7084     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7085                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7086   case NEON::BI__builtin_neon_vcvt_f16_s16:
7087   case NEON::BI__builtin_neon_vcvt_f16_u16:
7088   case NEON::BI__builtin_neon_vcvtq_f16_s16:
7089   case NEON::BI__builtin_neon_vcvtq_f16_u16:
7090     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7091     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
7092                      HasLegalHalfType);
7093     return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7094                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7095   case NEON::BI__builtin_neon_vcvt_n_f16_s16:
7096   case NEON::BI__builtin_neon_vcvt_n_f16_u16:
7097   case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
7098   case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
7099     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7100     Function *F = CGM.getIntrinsic(Int, Tys);
7101     return EmitNeonCall(F, Ops, "vcvt_n");
7102   }
7103   case NEON::BI__builtin_neon_vcvt_n_f32_v:
7104   case NEON::BI__builtin_neon_vcvt_n_f64_v:
7105   case NEON::BI__builtin_neon_vcvtq_n_f32_v:
7106   case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
7107     llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7108     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
7109     Function *F = CGM.getIntrinsic(Int, Tys);
7110     return EmitNeonCall(F, Ops, "vcvt_n");
7111   }
7112   case NEON::BI__builtin_neon_vcvt_n_s16_f16:
7113   case NEON::BI__builtin_neon_vcvt_n_s32_v:
7114   case NEON::BI__builtin_neon_vcvt_n_u16_f16:
7115   case NEON::BI__builtin_neon_vcvt_n_u32_v:
7116   case NEON::BI__builtin_neon_vcvt_n_s64_v:
7117   case NEON::BI__builtin_neon_vcvt_n_u64_v:
7118   case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
7119   case NEON::BI__builtin_neon_vcvtq_n_s32_v:
7120   case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
7121   case NEON::BI__builtin_neon_vcvtq_n_u32_v:
7122   case NEON::BI__builtin_neon_vcvtq_n_s64_v:
7123   case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
7124     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7125     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7126     return EmitNeonCall(F, Ops, "vcvt_n");
7127   }
7128   case NEON::BI__builtin_neon_vcvt_s32_v:
7129   case NEON::BI__builtin_neon_vcvt_u32_v:
7130   case NEON::BI__builtin_neon_vcvt_s64_v:
7131   case NEON::BI__builtin_neon_vcvt_u64_v:
7132   case NEON::BI__builtin_neon_vcvt_s16_f16:
7133   case NEON::BI__builtin_neon_vcvt_u16_f16:
7134   case NEON::BI__builtin_neon_vcvtq_s32_v:
7135   case NEON::BI__builtin_neon_vcvtq_u32_v:
7136   case NEON::BI__builtin_neon_vcvtq_s64_v:
7137   case NEON::BI__builtin_neon_vcvtq_u64_v:
7138   case NEON::BI__builtin_neon_vcvtq_s16_f16:
7139   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
7140     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
7141     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
7142                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
7143   }
7144   case NEON::BI__builtin_neon_vcvta_s16_f16:
7145   case NEON::BI__builtin_neon_vcvta_s32_v:
7146   case NEON::BI__builtin_neon_vcvta_s64_v:
7147   case NEON::BI__builtin_neon_vcvta_u16_f16:
7148   case NEON::BI__builtin_neon_vcvta_u32_v:
7149   case NEON::BI__builtin_neon_vcvta_u64_v:
7150   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
7151   case NEON::BI__builtin_neon_vcvtaq_s32_v:
7152   case NEON::BI__builtin_neon_vcvtaq_s64_v:
7153   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
7154   case NEON::BI__builtin_neon_vcvtaq_u32_v:
7155   case NEON::BI__builtin_neon_vcvtaq_u64_v:
7156   case NEON::BI__builtin_neon_vcvtn_s16_f16:
7157   case NEON::BI__builtin_neon_vcvtn_s32_v:
7158   case NEON::BI__builtin_neon_vcvtn_s64_v:
7159   case NEON::BI__builtin_neon_vcvtn_u16_f16:
7160   case NEON::BI__builtin_neon_vcvtn_u32_v:
7161   case NEON::BI__builtin_neon_vcvtn_u64_v:
7162   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
7163   case NEON::BI__builtin_neon_vcvtnq_s32_v:
7164   case NEON::BI__builtin_neon_vcvtnq_s64_v:
7165   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
7166   case NEON::BI__builtin_neon_vcvtnq_u32_v:
7167   case NEON::BI__builtin_neon_vcvtnq_u64_v:
7168   case NEON::BI__builtin_neon_vcvtp_s16_f16:
7169   case NEON::BI__builtin_neon_vcvtp_s32_v:
7170   case NEON::BI__builtin_neon_vcvtp_s64_v:
7171   case NEON::BI__builtin_neon_vcvtp_u16_f16:
7172   case NEON::BI__builtin_neon_vcvtp_u32_v:
7173   case NEON::BI__builtin_neon_vcvtp_u64_v:
7174   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
7175   case NEON::BI__builtin_neon_vcvtpq_s32_v:
7176   case NEON::BI__builtin_neon_vcvtpq_s64_v:
7177   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
7178   case NEON::BI__builtin_neon_vcvtpq_u32_v:
7179   case NEON::BI__builtin_neon_vcvtpq_u64_v:
7180   case NEON::BI__builtin_neon_vcvtm_s16_f16:
7181   case NEON::BI__builtin_neon_vcvtm_s32_v:
7182   case NEON::BI__builtin_neon_vcvtm_s64_v:
7183   case NEON::BI__builtin_neon_vcvtm_u16_f16:
7184   case NEON::BI__builtin_neon_vcvtm_u32_v:
7185   case NEON::BI__builtin_neon_vcvtm_u64_v:
7186   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
7187   case NEON::BI__builtin_neon_vcvtmq_s32_v:
7188   case NEON::BI__builtin_neon_vcvtmq_s64_v:
7189   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
7190   case NEON::BI__builtin_neon_vcvtmq_u32_v:
7191   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
7192     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7193     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
7194   }
7195   case NEON::BI__builtin_neon_vcvtx_f32_v: {
    llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty };
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
  }
7200   case NEON::BI__builtin_neon_vext_v:
7201   case NEON::BI__builtin_neon_vextq_v: {
7202     int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
7203     SmallVector<int, 16> Indices;
7204     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7205       Indices.push_back(i+CV);
7206 
7207     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7208     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7209     return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
7210   }
7211   case NEON::BI__builtin_neon_vfma_v:
7212   case NEON::BI__builtin_neon_vfmaq_v: {
7213     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7214     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7215     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7216 
    // The NEON intrinsic puts the accumulator first, unlike the LLVM fma
    // intrinsic.
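    // e.g. vfmaq_f32(acc, x, y) computes acc + x * y, so it maps to
    // llvm.fma(x, y, acc).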
7218     return emitCallMaybeConstrainedFPBuiltin(
7219         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
7220         {Ops[1], Ops[2], Ops[0]});
7221   }
7222   case NEON::BI__builtin_neon_vld1_v:
7223   case NEON::BI__builtin_neon_vld1q_v: {
7224     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7225     Ops.push_back(getAlignmentValue32(PtrOp0));
7226     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
7227   }
7228   case NEON::BI__builtin_neon_vld1_x2_v:
7229   case NEON::BI__builtin_neon_vld1q_x2_v:
7230   case NEON::BI__builtin_neon_vld1_x3_v:
7231   case NEON::BI__builtin_neon_vld1q_x3_v:
7232   case NEON::BI__builtin_neon_vld1_x4_v:
7233   case NEON::BI__builtin_neon_vld1q_x4_v: {
7234     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getElementType());
7235     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
7236     llvm::Type *Tys[2] = { VTy, PTy };
7237     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7238     Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
7239     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
7240     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7241     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7242   }
7243   case NEON::BI__builtin_neon_vld2_v:
7244   case NEON::BI__builtin_neon_vld2q_v:
7245   case NEON::BI__builtin_neon_vld3_v:
7246   case NEON::BI__builtin_neon_vld3q_v:
7247   case NEON::BI__builtin_neon_vld4_v:
7248   case NEON::BI__builtin_neon_vld4q_v:
7249   case NEON::BI__builtin_neon_vld2_dup_v:
7250   case NEON::BI__builtin_neon_vld2q_dup_v:
7251   case NEON::BI__builtin_neon_vld3_dup_v:
7252   case NEON::BI__builtin_neon_vld3q_dup_v:
7253   case NEON::BI__builtin_neon_vld4_dup_v:
7254   case NEON::BI__builtin_neon_vld4q_dup_v: {
7255     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7256     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7257     Value *Align = getAlignmentValue32(PtrOp1);
7258     Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
7259     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
7260     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7261     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7262   }
7263   case NEON::BI__builtin_neon_vld1_dup_v:
7264   case NEON::BI__builtin_neon_vld1q_dup_v: {
7265     Value *V = PoisonValue::get(Ty);
7266     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
7267     LoadInst *Ld = Builder.CreateLoad(PtrOp0);
7268     llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
7269     Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
7270     return EmitNeonSplat(Ops[0], CI);
7271   }
7272   case NEON::BI__builtin_neon_vld2_lane_v:
7273   case NEON::BI__builtin_neon_vld2q_lane_v:
7274   case NEON::BI__builtin_neon_vld3_lane_v:
7275   case NEON::BI__builtin_neon_vld3q_lane_v:
7276   case NEON::BI__builtin_neon_vld4_lane_v:
7277   case NEON::BI__builtin_neon_vld4q_lane_v: {
7278     llvm::Type *Tys[] = {Ty, Int8PtrTy};
7279     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7280     for (unsigned I = 2; I < Ops.size() - 1; ++I)
7281       Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
7282     Ops.push_back(getAlignmentValue32(PtrOp1));
7283     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint);
7284     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
7285     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7286     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7287   }
7288   case NEON::BI__builtin_neon_vmovl_v: {
7289     llvm::FixedVectorType *DTy =
7290         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7291     Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
7292     if (Usgn)
7293       return Builder.CreateZExt(Ops[0], Ty, "vmovl");
7294     return Builder.CreateSExt(Ops[0], Ty, "vmovl");
7295   }
7296   case NEON::BI__builtin_neon_vmovn_v: {
7297     llvm::FixedVectorType *QTy =
7298         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7299     Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
7300     return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
7301   }
7302   case NEON::BI__builtin_neon_vmull_v:
    // FIXME: the integer vmull operations could be emitted in terms of pure
    // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
    // hoisting the exts outside loops. Until GlobalISel comes along and can
    // see through such movement, this leads to bad CodeGen. So we need an
    // intrinsic for now.
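    // The pure-IR form would be roughly (for a <4 x i16> -> <4 x i32> vmull):
    //   %lhs  = sext <4 x i16> %a to <4 x i32>
    //   %rhs  = sext <4 x i16> %b to <4 x i32>
    //   %prod = mul <4 x i32> %lhs, %rhs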
7308     Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
7309     Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
7310     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
7311   case NEON::BI__builtin_neon_vpadal_v:
7312   case NEON::BI__builtin_neon_vpadalq_v: {
7313     // The source operand type has twice as many elements of half the size.
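    // e.g. for vpadal_s8 the accumulator/result is <4 x i16> and the source
    // vector being pairwise-added is <8 x i8>.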
7314     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7315     llvm::Type *EltTy =
7316       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7317     auto *NarrowTy =
7318         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7319     llvm::Type *Tys[2] = { Ty, NarrowTy };
7320     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7321   }
7322   case NEON::BI__builtin_neon_vpaddl_v:
7323   case NEON::BI__builtin_neon_vpaddlq_v: {
7324     // The source operand type has twice as many elements of half the size.
7325     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7326     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7327     auto *NarrowTy =
7328         llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7329     llvm::Type *Tys[2] = { Ty, NarrowTy };
7330     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
7331   }
7332   case NEON::BI__builtin_neon_vqdmlal_v:
7333   case NEON::BI__builtin_neon_vqdmlsl_v: {
7334     SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
7335     Ops[1] =
7336         EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
7337     Ops.resize(2);
7338     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
7339   }
7340   case NEON::BI__builtin_neon_vqdmulhq_lane_v:
7341   case NEON::BI__builtin_neon_vqdmulh_lane_v:
7342   case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
7343   case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
7344     auto *RTy = cast<llvm::FixedVectorType>(Ty);
7345     if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
7346         BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
7347       RTy = llvm::FixedVectorType::get(RTy->getElementType(),
7348                                        RTy->getNumElements() * 2);
7349     llvm::Type *Tys[2] = {
7350         RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7351                                              /*isQuad*/ false))};
7352     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7353   }
7354   case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
7355   case NEON::BI__builtin_neon_vqdmulh_laneq_v:
7356   case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
7357   case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
7358     llvm::Type *Tys[2] = {
7359         Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7360                                             /*isQuad*/ true))};
7361     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7362   }
7363   case NEON::BI__builtin_neon_vqshl_n_v:
7364   case NEON::BI__builtin_neon_vqshlq_n_v:
7365     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
7366                         1, false);
7367   case NEON::BI__builtin_neon_vqshlu_n_v:
7368   case NEON::BI__builtin_neon_vqshluq_n_v:
7369     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
7370                         1, false);
7371   case NEON::BI__builtin_neon_vrecpe_v:
7372   case NEON::BI__builtin_neon_vrecpeq_v:
7373   case NEON::BI__builtin_neon_vrsqrte_v:
7374   case NEON::BI__builtin_neon_vrsqrteq_v:
7375     Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
7376     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7377   case NEON::BI__builtin_neon_vrndi_v:
7378   case NEON::BI__builtin_neon_vrndiq_v:
7379     Int = Builder.getIsFPConstrained()
7380               ? Intrinsic::experimental_constrained_nearbyint
7381               : Intrinsic::nearbyint;
7382     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7383   case NEON::BI__builtin_neon_vrshr_n_v:
7384   case NEON::BI__builtin_neon_vrshrq_n_v:
7385     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
7386                         1, true);
7387   case NEON::BI__builtin_neon_vsha512hq_u64:
7388   case NEON::BI__builtin_neon_vsha512h2q_u64:
7389   case NEON::BI__builtin_neon_vsha512su0q_u64:
7390   case NEON::BI__builtin_neon_vsha512su1q_u64: {
7391     Function *F = CGM.getIntrinsic(Int);
7392     return EmitNeonCall(F, Ops, "");
7393   }
7394   case NEON::BI__builtin_neon_vshl_n_v:
7395   case NEON::BI__builtin_neon_vshlq_n_v:
7396     Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
7397     return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
7398                              "vshl_n");
7399   case NEON::BI__builtin_neon_vshll_n_v: {
7400     llvm::FixedVectorType *SrcTy =
7401         llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7402     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7403     if (Usgn)
7404       Ops[0] = Builder.CreateZExt(Ops[0], VTy);
7405     else
7406       Ops[0] = Builder.CreateSExt(Ops[0], VTy);
7407     Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
7408     return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
7409   }
7410   case NEON::BI__builtin_neon_vshrn_n_v: {
7411     llvm::FixedVectorType *SrcTy =
7412         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7413     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7414     Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
7415     if (Usgn)
7416       Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
7417     else
7418       Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
7419     return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
7420   }
7421   case NEON::BI__builtin_neon_vshr_n_v:
7422   case NEON::BI__builtin_neon_vshrq_n_v:
7423     return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
7424   case NEON::BI__builtin_neon_vst1_v:
7425   case NEON::BI__builtin_neon_vst1q_v:
7426   case NEON::BI__builtin_neon_vst2_v:
7427   case NEON::BI__builtin_neon_vst2q_v:
7428   case NEON::BI__builtin_neon_vst3_v:
7429   case NEON::BI__builtin_neon_vst3q_v:
7430   case NEON::BI__builtin_neon_vst4_v:
7431   case NEON::BI__builtin_neon_vst4q_v:
7432   case NEON::BI__builtin_neon_vst2_lane_v:
7433   case NEON::BI__builtin_neon_vst2q_lane_v:
7434   case NEON::BI__builtin_neon_vst3_lane_v:
7435   case NEON::BI__builtin_neon_vst3q_lane_v:
7436   case NEON::BI__builtin_neon_vst4_lane_v:
7437   case NEON::BI__builtin_neon_vst4q_lane_v: {
7438     llvm::Type *Tys[] = {Int8PtrTy, Ty};
7439     Ops.push_back(getAlignmentValue32(PtrOp0));
7440     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
7441   }
7442   case NEON::BI__builtin_neon_vsm3partw1q_u32:
7443   case NEON::BI__builtin_neon_vsm3partw2q_u32:
7444   case NEON::BI__builtin_neon_vsm3ss1q_u32:
7445   case NEON::BI__builtin_neon_vsm4ekeyq_u32:
7446   case NEON::BI__builtin_neon_vsm4eq_u32: {
7447     Function *F = CGM.getIntrinsic(Int);
7448     return EmitNeonCall(F, Ops, "");
7449   }
7450   case NEON::BI__builtin_neon_vsm3tt1aq_u32:
7451   case NEON::BI__builtin_neon_vsm3tt1bq_u32:
7452   case NEON::BI__builtin_neon_vsm3tt2aq_u32:
7453   case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
7454     Function *F = CGM.getIntrinsic(Int);
7455     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7456     return EmitNeonCall(F, Ops, "");
7457   }
7458   case NEON::BI__builtin_neon_vst1_x2_v:
7459   case NEON::BI__builtin_neon_vst1q_x2_v:
7460   case NEON::BI__builtin_neon_vst1_x3_v:
7461   case NEON::BI__builtin_neon_vst1q_x3_v:
7462   case NEON::BI__builtin_neon_vst1_x4_v:
7463   case NEON::BI__builtin_neon_vst1q_x4_v: {
7464     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getElementType());
    // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
    // in AArch64 it comes last. We may want to stick to one or the other.
7467     if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
7468         Arch == llvm::Triple::aarch64_32) {
7469       llvm::Type *Tys[2] = { VTy, PTy };
7470       std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7471       return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7472     }
7473     llvm::Type *Tys[2] = { PTy, VTy };
7474     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7475   }
7476   case NEON::BI__builtin_neon_vsubhn_v: {
7477     llvm::FixedVectorType *SrcTy =
7478         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7479 
    // %diff = sub <4 x i32> %lhs, %rhs
7481     Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7482     Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7483     Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
7484 
    // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
7486     Constant *ShiftAmt =
7487         ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7488     Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
7489 
7490     // %res = trunc <4 x i32> %high to <4 x i16>
7491     return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
7492   }
7493   case NEON::BI__builtin_neon_vtrn_v:
7494   case NEON::BI__builtin_neon_vtrnq_v: {
7495     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7496     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7497     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7498     Value *SV = nullptr;
7499 
7500     for (unsigned vi = 0; vi != 2; ++vi) {
7501       SmallVector<int, 16> Indices;
7502       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7503         Indices.push_back(i+vi);
7504         Indices.push_back(i+e+vi);
7505       }
7506       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7507       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
7508       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7509     }
7510     return SV;
7511   }
7512   case NEON::BI__builtin_neon_vtst_v:
7513   case NEON::BI__builtin_neon_vtstq_v: {
7514     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7515     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7516     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
7517     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
7518                                 ConstantAggregateZero::get(Ty));
7519     return Builder.CreateSExt(Ops[0], Ty, "vtst");
7520   }
7521   case NEON::BI__builtin_neon_vuzp_v:
7522   case NEON::BI__builtin_neon_vuzpq_v: {
7523     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7524     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7525     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7526     Value *SV = nullptr;
7527 
7528     for (unsigned vi = 0; vi != 2; ++vi) {
7529       SmallVector<int, 16> Indices;
7530       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7531         Indices.push_back(2*i+vi);
7532 
7533       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7534       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
7535       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7536     }
7537     return SV;
7538   }
7539   case NEON::BI__builtin_neon_vxarq_u64: {
7540     Function *F = CGM.getIntrinsic(Int);
7541     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
7542     return EmitNeonCall(F, Ops, "");
7543   }
7544   case NEON::BI__builtin_neon_vzip_v:
7545   case NEON::BI__builtin_neon_vzipq_v: {
7546     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
7547     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7548     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7549     Value *SV = nullptr;
7550 
7551     for (unsigned vi = 0; vi != 2; ++vi) {
7552       SmallVector<int, 16> Indices;
7553       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7554         Indices.push_back((i + vi*e) >> 1);
7555         Indices.push_back(((i + vi*e) >> 1)+e);
7556       }
7557       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7558       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
7559       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7560     }
7561     return SV;
7562   }
7563   case NEON::BI__builtin_neon_vdot_s32:
7564   case NEON::BI__builtin_neon_vdot_u32:
7565   case NEON::BI__builtin_neon_vdotq_s32:
7566   case NEON::BI__builtin_neon_vdotq_u32: {
7567     auto *InputTy =
7568         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7569     llvm::Type *Tys[2] = { Ty, InputTy };
7570     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
7571   }
7572   case NEON::BI__builtin_neon_vfmlal_low_f16:
7573   case NEON::BI__builtin_neon_vfmlalq_low_f16: {
7574     auto *InputTy =
7575         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7576     llvm::Type *Tys[2] = { Ty, InputTy };
7577     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
7578   }
7579   case NEON::BI__builtin_neon_vfmlsl_low_f16:
7580   case NEON::BI__builtin_neon_vfmlslq_low_f16: {
7581     auto *InputTy =
7582         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7583     llvm::Type *Tys[2] = { Ty, InputTy };
7584     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
7585   }
7586   case NEON::BI__builtin_neon_vfmlal_high_f16:
7587   case NEON::BI__builtin_neon_vfmlalq_high_f16: {
7588     auto *InputTy =
7589         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7590     llvm::Type *Tys[2] = { Ty, InputTy };
7591     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
7592   }
7593   case NEON::BI__builtin_neon_vfmlsl_high_f16:
7594   case NEON::BI__builtin_neon_vfmlslq_high_f16: {
7595     auto *InputTy =
7596         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
7597     llvm::Type *Tys[2] = { Ty, InputTy };
7598     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
7599   }
7600   case NEON::BI__builtin_neon_vmmlaq_s32:
7601   case NEON::BI__builtin_neon_vmmlaq_u32: {
7602     auto *InputTy =
7603         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7604     llvm::Type *Tys[2] = { Ty, InputTy };
7605     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla");
7606   }
7607   case NEON::BI__builtin_neon_vusmmlaq_s32: {
7608     auto *InputTy =
7609         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7610     llvm::Type *Tys[2] = { Ty, InputTy };
7611     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
7612   }
7613   case NEON::BI__builtin_neon_vusdot_s32:
7614   case NEON::BI__builtin_neon_vusdotq_s32: {
7615     auto *InputTy =
7616         llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
7617     llvm::Type *Tys[2] = { Ty, InputTy };
7618     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
7619   }
7620   case NEON::BI__builtin_neon_vbfdot_f32:
7621   case NEON::BI__builtin_neon_vbfdotq_f32: {
7622     llvm::Type *InputTy =
7623         llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
7624     llvm::Type *Tys[2] = { Ty, InputTy };
7625     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
7626   }
7627   case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
7628     llvm::Type *Tys[1] = { Ty };
7629     Function *F = CGM.getIntrinsic(Int, Tys);
7630     return EmitNeonCall(F, Ops, "vcvtfp2bf");
7631   }
  }
7634 
7635   assert(Int && "Expected valid intrinsic number");
7636 
  // Determine the type(s) of this overloaded NEON intrinsic.
7638   Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
7639 
7640   Value *Result = EmitNeonCall(F, Ops, NameHint);
7641   llvm::Type *ResultType = ConvertType(E->getType());
  // An AArch64 intrinsic may return a one-element vector type; cast it to the
  // scalar type expected by the builtin.
7644   return Builder.CreateBitCast(Result, ResultType, NameHint);
7645 }
7646 
7647 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
7648     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
7649     const CmpInst::Predicate Ip, const Twine &Name) {
7650   llvm::Type *OTy = Op->getType();
7651 
7652   // FIXME: this is utterly horrific. We should not be looking at previous
7653   // codegen context to find out what needs doing. Unfortunately TableGen
7654   // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
7655   // (etc).
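  // e.g. for vceqz_f32 the operand usually arrives as a bitcast from
  // <2 x float>; looking through that bitcast lets us emit an fcmp against
  // zero rather than an icmp.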
7656   if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
7657     OTy = BI->getOperand(0)->getType();
7658 
7659   Op = Builder.CreateBitCast(Op, OTy);
7660   if (OTy->getScalarType()->isFloatingPointTy()) {
7661     if (Fp == CmpInst::FCMP_OEQ)
7662       Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
7663     else
7664       Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
7665   } else {
7666     Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
7667   }
7668   return Builder.CreateSExt(Op, Ty, Name);
7669 }
7670 
7671 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
7672                                  Value *ExtOp, Value *IndexOp,
7673                                  llvm::Type *ResTy, unsigned IntID,
7674                                  const char *Name) {
7675   SmallVector<Value *, 2> TblOps;
7676   if (ExtOp)
7677     TblOps.push_back(ExtOp);
7678 
  // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
7680   SmallVector<int, 16> Indices;
7681   auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
7682   for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
7683     Indices.push_back(2*i);
7684     Indices.push_back(2*i+1);
7685   }
7686 
7687   int PairPos = 0, End = Ops.size() - 1;
7688   while (PairPos < End) {
7689     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
7690                                                      Ops[PairPos+1], Indices,
7691                                                      Name));
7692     PairPos += 2;
7693   }
7694 
  // If there's an odd number of 64-bit lookup tables, fill the high 64 bits
  // of the last 128-bit lookup table with zero.
7697   if (PairPos == End) {
7698     Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
7699     TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
7700                                                      ZeroTbl, Indices, Name));
7701   }
7702 
7703   Function *TblF;
7704   TblOps.push_back(IndexOp);
7705   TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
7706 
7707   return CGF.EmitNeonCall(TblF, TblOps, Name);
7708 }
7709 
7710 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
7711   unsigned Value;
7712   switch (BuiltinID) {
7713   default:
7714     return nullptr;
7715   case clang::ARM::BI__builtin_arm_nop:
7716     Value = 0;
7717     break;
7718   case clang::ARM::BI__builtin_arm_yield:
7719   case clang::ARM::BI__yield:
7720     Value = 1;
7721     break;
7722   case clang::ARM::BI__builtin_arm_wfe:
7723   case clang::ARM::BI__wfe:
7724     Value = 2;
7725     break;
7726   case clang::ARM::BI__builtin_arm_wfi:
7727   case clang::ARM::BI__wfi:
7728     Value = 3;
7729     break;
7730   case clang::ARM::BI__builtin_arm_sev:
7731   case clang::ARM::BI__sev:
7732     Value = 4;
7733     break;
7734   case clang::ARM::BI__builtin_arm_sevl:
7735   case clang::ARM::BI__sevl:
7736     Value = 5;
7737     break;
7738   }
7739 
7740   return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
7741                             llvm::ConstantInt::get(Int32Ty, Value));
7742 }
7743 
7744 enum SpecialRegisterAccessKind {
7745   NormalRead,
7746   VolatileRead,
7747   Write,
7748 };
7749 
// Generates the IR for the read/write special register builtin.
// ValueType is the type of the value that is to be written or read, and
// RegisterType is the type of the register being written to or read from.
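// For example, a read such as __builtin_arm_rsr("cpsr") lowers to a call to
// llvm.read_volatile_register with the register name attached as metadata
// (illustrative; any register-name string accepted by the builtin works the
// same way).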
7753 static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
7754                                          const CallExpr *E,
7755                                          llvm::Type *RegisterType,
7756                                          llvm::Type *ValueType,
7757                                          SpecialRegisterAccessKind AccessKind,
7758                                          StringRef SysReg = "") {
  // The read and write register intrinsics only support 32-, 64- and 128-bit
  // operations.
7760   assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
7761           RegisterType->isIntegerTy(128)) &&
7762          "Unsupported size for register.");
7763 
7764   CodeGen::CGBuilderTy &Builder = CGF.Builder;
7765   CodeGen::CodeGenModule &CGM = CGF.CGM;
7766   LLVMContext &Context = CGM.getLLVMContext();
7767 
7768   if (SysReg.empty()) {
7769     const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
7770     SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
7771   }
7772 
7773   llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
7774   llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
7775   llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
7776 
7777   llvm::Type *Types[] = { RegisterType };
7778 
7779   bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
7780   assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
7781             && "Can't fit 64-bit value in 32-bit register");
7782 
7783   if (AccessKind != Write) {
7784     assert(AccessKind == NormalRead || AccessKind == VolatileRead);
7785     llvm::Function *F = CGM.getIntrinsic(
7786         AccessKind == VolatileRead ? llvm::Intrinsic::read_volatile_register
7787                                    : llvm::Intrinsic::read_register,
7788         Types);
7789     llvm::Value *Call = Builder.CreateCall(F, Metadata);
7790 
7791     if (MixedTypes)
7792       // Read into 64 bit register and then truncate result to 32 bit.
7793       return Builder.CreateTrunc(Call, ValueType);
7794 
7795     if (ValueType->isPointerTy())
7796       // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
7797       return Builder.CreateIntToPtr(Call, ValueType);
7798 
7799     return Call;
7800   }
7801 
7802   llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
7803   llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
7804   if (MixedTypes) {
7805     // Extend 32 bit write value to 64 bit to pass to write.
7806     ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
7807     return Builder.CreateCall(F, { Metadata, ArgValue });
7808   }
7809 
7810   if (ValueType->isPointerTy()) {
    // Have VoidPtrTy ArgValue but the write intrinsic expects an i32/i64.
7812     ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
7813     return Builder.CreateCall(F, { Metadata, ArgValue });
7814   }
7815 
7816   return Builder.CreateCall(F, { Metadata, ArgValue });
7817 }
7818 
7819 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
7820 /// argument that specifies the vector type.
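/// For example, the overloaded "_v" builtins such as vadd_v carry a trailing
/// constant that encodes the NeonTypeFlags, whereas the lane, dup and SHA
/// builtins listed below do not.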
7821 static bool HasExtraNeonArgument(unsigned BuiltinID) {
7822   switch (BuiltinID) {
7823   default: break;
7824   case NEON::BI__builtin_neon_vget_lane_i8:
7825   case NEON::BI__builtin_neon_vget_lane_i16:
7826   case NEON::BI__builtin_neon_vget_lane_bf16:
7827   case NEON::BI__builtin_neon_vget_lane_i32:
7828   case NEON::BI__builtin_neon_vget_lane_i64:
7829   case NEON::BI__builtin_neon_vget_lane_f32:
7830   case NEON::BI__builtin_neon_vgetq_lane_i8:
7831   case NEON::BI__builtin_neon_vgetq_lane_i16:
7832   case NEON::BI__builtin_neon_vgetq_lane_bf16:
7833   case NEON::BI__builtin_neon_vgetq_lane_i32:
7834   case NEON::BI__builtin_neon_vgetq_lane_i64:
7835   case NEON::BI__builtin_neon_vgetq_lane_f32:
7836   case NEON::BI__builtin_neon_vduph_lane_bf16:
7837   case NEON::BI__builtin_neon_vduph_laneq_bf16:
7838   case NEON::BI__builtin_neon_vset_lane_i8:
7839   case NEON::BI__builtin_neon_vset_lane_i16:
7840   case NEON::BI__builtin_neon_vset_lane_bf16:
7841   case NEON::BI__builtin_neon_vset_lane_i32:
7842   case NEON::BI__builtin_neon_vset_lane_i64:
7843   case NEON::BI__builtin_neon_vset_lane_f32:
7844   case NEON::BI__builtin_neon_vsetq_lane_i8:
7845   case NEON::BI__builtin_neon_vsetq_lane_i16:
7846   case NEON::BI__builtin_neon_vsetq_lane_bf16:
7847   case NEON::BI__builtin_neon_vsetq_lane_i32:
7848   case NEON::BI__builtin_neon_vsetq_lane_i64:
7849   case NEON::BI__builtin_neon_vsetq_lane_f32:
7850   case NEON::BI__builtin_neon_vsha1h_u32:
7851   case NEON::BI__builtin_neon_vsha1cq_u32:
7852   case NEON::BI__builtin_neon_vsha1pq_u32:
7853   case NEON::BI__builtin_neon_vsha1mq_u32:
7854   case NEON::BI__builtin_neon_vcvth_bf16_f32:
7855   case clang::ARM::BI_MoveToCoprocessor:
7856   case clang::ARM::BI_MoveToCoprocessor2:
7857     return false;
7858   }
7859   return true;
7860 }
7861 
7862 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
7863                                            const CallExpr *E,
7864                                            ReturnValueSlot ReturnValue,
7865                                            llvm::Triple::ArchType Arch) {
7866   if (auto Hint = GetValueForARMHint(BuiltinID))
7867     return Hint;
7868 
7869   if (BuiltinID == clang::ARM::BI__emit) {
7870     bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
7871     llvm::FunctionType *FTy =
7872         llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
7873 
7874     Expr::EvalResult Result;
7875     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
7876       llvm_unreachable("Sema will ensure that the parameter is constant");
7877 
7878     llvm::APSInt Value = Result.Val.getInt();
7879     uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
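    // e.g. in Thumb mode, __emit(0x4770) produces inline asm ".inst.n 0x4770"
    // (the encoding of "bx lr"); in ARM mode the full 32-bit value is emitted
    // with ".inst" (illustrative).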
7880 
7881     llvm::InlineAsm *Emit =
7882         IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
7883                                  /*hasSideEffects=*/true)
7884                 : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
7885                                  /*hasSideEffects=*/true);
7886 
7887     return Builder.CreateCall(Emit);
7888   }
7889 
7890   if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
7891     Value *Option = EmitScalarExpr(E->getArg(0));
7892     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
7893   }
7894 
7895   if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
7896     Value *Address = EmitScalarExpr(E->getArg(0));
7897     Value *RW      = EmitScalarExpr(E->getArg(1));
7898     Value *IsData  = EmitScalarExpr(E->getArg(2));
7899 
    // Locality is not supported on the ARM target.
7901     Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
7902 
7903     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
7904     return Builder.CreateCall(F, {Address, RW, Locality, IsData});
7905   }
7906 
7907   if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
7908     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7909     return Builder.CreateCall(
7910         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
7911   }
7912 
7913   if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
7914       BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
7915     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7916     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
7917     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
7918     if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
7919       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
7920     return Res;
7921   }
7924   if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
7925     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7926     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
7927   }
7928   if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
7929     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
7930     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
7931                               "cls");
7932   }
7933 
7934   if (BuiltinID == clang::ARM::BI__clear_cache) {
7935     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
7936     const FunctionDecl *FD = E->getDirectCallee();
7937     Value *Ops[2];
7938     for (unsigned i = 0; i < 2; i++)
7939       Ops[i] = EmitScalarExpr(E->getArg(i));
7940     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
7941     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
7942     StringRef Name = FD->getName();
7943     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
7944   }
7945 
7946   if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
7947       BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
7948     Function *F;
7949 
7950     switch (BuiltinID) {
7951     default: llvm_unreachable("unexpected builtin");
7952     case clang::ARM::BI__builtin_arm_mcrr:
7953       F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
7954       break;
7955     case clang::ARM::BI__builtin_arm_mcrr2:
7956       F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
7957       break;
7958     }
7959 
    // The MCRR{2} instruction has 5 operands, but the intrinsic has only 4
    // because Rt and Rt2 are represented as a single unsigned 64-bit integer
    // in the intrinsic definition; internally it is represented as two 32-bit
    // integers.
7966 
7967     Value *Coproc = EmitScalarExpr(E->getArg(0));
7968     Value *Opc1 = EmitScalarExpr(E->getArg(1));
7969     Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
7970     Value *CRm = EmitScalarExpr(E->getArg(3));
7971 
7972     Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
7973     Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
7974     Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
7975     Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
7976 
7977     return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
7978   }
7979 
7980   if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
7981       BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
7982     Function *F;
7983 
7984     switch (BuiltinID) {
7985     default: llvm_unreachable("unexpected builtin");
7986     case clang::ARM::BI__builtin_arm_mrrc:
7987       F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
7988       break;
7989     case clang::ARM::BI__builtin_arm_mrrc2:
7990       F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
7991       break;
7992     }
7993 
7994     Value *Coproc = EmitScalarExpr(E->getArg(0));
7995     Value *Opc1 = EmitScalarExpr(E->getArg(1));
7996     Value *CRm  = EmitScalarExpr(E->getArg(2));
7997     Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
7998 
    // Returns an unsigned 64-bit integer, represented as two 32-bit integers.
8001 
8002     Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
8003     Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
8004     Rt = Builder.CreateZExt(Rt, Int64Ty);
8005     Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
8006 
8007     Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
8008     RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
8009     RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
8010 
8011     return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
8012   }
8013 
8014   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
8015       ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8016         BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
8017        getContext().getTypeSize(E->getType()) == 64) ||
8018       BuiltinID == clang::ARM::BI__ldrexd) {
8019     Function *F;
8020 
8021     switch (BuiltinID) {
8022     default: llvm_unreachable("unexpected builtin");
8023     case clang::ARM::BI__builtin_arm_ldaex:
8024       F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
8025       break;
8026     case clang::ARM::BI__builtin_arm_ldrexd:
8027     case clang::ARM::BI__builtin_arm_ldrex:
8028     case clang::ARM::BI__ldrexd:
8029       F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
8030       break;
8031     }
8032 
8033     Value *LdPtr = EmitScalarExpr(E->getArg(0));
8034     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
8035                                     "ldrexd");
8036 
8037     Value *Val0 = Builder.CreateExtractValue(Val, 1);
8038     Value *Val1 = Builder.CreateExtractValue(Val, 0);
8039     Val0 = Builder.CreateZExt(Val0, Int64Ty);
8040     Val1 = Builder.CreateZExt(Val1, Int64Ty);
8041 
8042     Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
8043     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
8044     Val = Builder.CreateOr(Val, Val1);
8045     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
8046   }
8047 
8048   if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8049       BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
8050     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
8051 
8052     QualType Ty = E->getType();
8053     llvm::Type *RealResTy = ConvertType(Ty);
8054     llvm::Type *IntTy =
8055         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8056     llvm::Type *PtrTy = llvm::PointerType::getUnqual(getLLVMContext());
8057 
8058     Function *F = CGM.getIntrinsic(
8059         BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
8060                                                        : Intrinsic::arm_ldrex,
8061         PtrTy);
8062     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
8063     Val->addParamAttr(
8064         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
8065 
8066     if (RealResTy->isPointerTy())
8067       return Builder.CreateIntToPtr(Val, RealResTy);
8068     else {
8069       llvm::Type *IntResTy = llvm::IntegerType::get(
8070           getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
8071       return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
8072                                    RealResTy);
8073     }
8074   }
8075 
8076   if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
8077       ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
8078         BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
8079        getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
8080     Function *F = CGM.getIntrinsic(
8081         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
8082                                                        : Intrinsic::arm_strexd);
8083     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
8084 
8085     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
8086     Value *Val = EmitScalarExpr(E->getArg(0));
8087     Builder.CreateStore(Val, Tmp);
8088 
8089     Address LdPtr = Tmp.withElementType(STy);
8090     Val = Builder.CreateLoad(LdPtr);
8091 
8092     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
8093     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
8094     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), Int8PtrTy);
8095     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
8096   }
8097 
8098   if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
8099       BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
8100     Value *StoreVal = EmitScalarExpr(E->getArg(0));
8101     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
8102 
8103     QualType Ty = E->getArg(0)->getType();
8104     llvm::Type *StoreTy =
8105         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8106 
8107     if (StoreVal->getType()->isPointerTy())
8108       StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
8109     else {
8110       llvm::Type *IntTy = llvm::IntegerType::get(
8111           getLLVMContext(),
8112           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
8113       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
8114       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
8115     }
8116 
8117     Function *F = CGM.getIntrinsic(
8118         BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
8119                                                        : Intrinsic::arm_strex,
8120         StoreAddr->getType());
8121 
8122     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
8123     CI->addParamAttr(
8124         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
8125     return CI;
8126   }
8127 
8128   if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
8129     Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
8130     return Builder.CreateCall(F);
8131   }
8132 
8133   // CRC32
8134   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
8135   switch (BuiltinID) {
8136   case clang::ARM::BI__builtin_arm_crc32b:
8137     CRCIntrinsicID = Intrinsic::arm_crc32b; break;
8138   case clang::ARM::BI__builtin_arm_crc32cb:
8139     CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
8140   case clang::ARM::BI__builtin_arm_crc32h:
8141     CRCIntrinsicID = Intrinsic::arm_crc32h; break;
8142   case clang::ARM::BI__builtin_arm_crc32ch:
8143     CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
8144   case clang::ARM::BI__builtin_arm_crc32w:
8145   case clang::ARM::BI__builtin_arm_crc32d:
8146     CRCIntrinsicID = Intrinsic::arm_crc32w; break;
8147   case clang::ARM::BI__builtin_arm_crc32cw:
8148   case clang::ARM::BI__builtin_arm_crc32cd:
8149     CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
8150   }
8151 
8152   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
8153     Value *Arg0 = EmitScalarExpr(E->getArg(0));
8154     Value *Arg1 = EmitScalarExpr(E->getArg(1));
8155 
8156     // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
8157     // intrinsics, hence we need different codegen for these cases.
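    // e.g. crc32d(crc, x) is computed as
    // crc32w(crc32w(crc, lo32(x)), hi32(x)).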
8158     if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
8159         BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
8160       Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
8161       Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
8162       Value *Arg1b = Builder.CreateLShr(Arg1, C1);
8163       Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
8164 
8165       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8166       Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
8167       return Builder.CreateCall(F, {Res, Arg1b});
8168     } else {
8169       Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
8170 
8171       Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8172       return Builder.CreateCall(F, {Arg0, Arg1});
8173     }
8174   }
8175 
8176   if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8177       BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8178       BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8179       BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
8180       BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
8181       BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
8182 
8183     SpecialRegisterAccessKind AccessKind = Write;
8184     if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8185         BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8186         BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
8187       AccessKind = VolatileRead;
8188 
8189     bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8190                             BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
8191 
8192     bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8193                    BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
8194 
8195     llvm::Type *ValueType;
8196     llvm::Type *RegisterType;
8197     if (IsPointerBuiltin) {
8198       ValueType = VoidPtrTy;
8199       RegisterType = Int32Ty;
8200     } else if (Is64Bit) {
8201       ValueType = RegisterType = Int64Ty;
8202     } else {
8203       ValueType = RegisterType = Int32Ty;
8204     }
8205 
8206     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
8207                                       AccessKind);
8208   }
8209 
8210   if (BuiltinID == ARM::BI__builtin_sponentry) {
8211     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
8212     return Builder.CreateCall(F);
8213   }
8214 
8215   // Handle MSVC intrinsics before argument evaluation to prevent double
8216   // evaluation.
8217   if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
8218     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
8219 
8220   // Deal with MVE builtins
8221   if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8222     return Result;
8223   // Handle CDE builtins
8224   if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8225     return Result;
8226 
  // Some intrinsics are equivalent; if so, use the base intrinsic ID.
8228   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
8229     return P.first == BuiltinID;
8230   });
8231   if (It != end(NEONEquivalentIntrinsicMap))
8232     BuiltinID = It->second;
8233 
8234   // Find out if any arguments are required to be integer constant
8235   // expressions.
8236   unsigned ICEArguments = 0;
8237   ASTContext::GetBuiltinTypeError Error;
8238   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
8239   assert(Error == ASTContext::GE_None && "Should not codegen an error");
8240 
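  // Helper returning the alignment of an Address as an i32 constant, in the
  // form expected by the arm.neon vld/vst intrinsics emitted below.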
8241   auto getAlignmentValue32 = [&](Address addr) -> Value* {
8242     return Builder.getInt32(addr.getAlignment().getQuantity());
8243   };
8244 
8245   Address PtrOp0 = Address::invalid();
8246   Address PtrOp1 = Address::invalid();
8247   SmallVector<Value*, 4> Ops;
8248   bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
8249   unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
8250   for (unsigned i = 0, e = NumArgs; i != e; i++) {
8251     if (i == 0) {
8252       switch (BuiltinID) {
8253       case NEON::BI__builtin_neon_vld1_v:
8254       case NEON::BI__builtin_neon_vld1q_v:
8255       case NEON::BI__builtin_neon_vld1q_lane_v:
8256       case NEON::BI__builtin_neon_vld1_lane_v:
8257       case NEON::BI__builtin_neon_vld1_dup_v:
8258       case NEON::BI__builtin_neon_vld1q_dup_v:
8259       case NEON::BI__builtin_neon_vst1_v:
8260       case NEON::BI__builtin_neon_vst1q_v:
8261       case NEON::BI__builtin_neon_vst1q_lane_v:
8262       case NEON::BI__builtin_neon_vst1_lane_v:
8263       case NEON::BI__builtin_neon_vst2_v:
8264       case NEON::BI__builtin_neon_vst2q_v:
8265       case NEON::BI__builtin_neon_vst2_lane_v:
8266       case NEON::BI__builtin_neon_vst2q_lane_v:
8267       case NEON::BI__builtin_neon_vst3_v:
8268       case NEON::BI__builtin_neon_vst3q_v:
8269       case NEON::BI__builtin_neon_vst3_lane_v:
8270       case NEON::BI__builtin_neon_vst3q_lane_v:
8271       case NEON::BI__builtin_neon_vst4_v:
8272       case NEON::BI__builtin_neon_vst4q_v:
8273       case NEON::BI__builtin_neon_vst4_lane_v:
8274       case NEON::BI__builtin_neon_vst4q_lane_v:
8275         // Get the alignment for the argument in addition to the value;
8276         // we'll use it later.
8277         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
8278         Ops.push_back(PtrOp0.getPointer());
8279         continue;
8280       }
8281     }
8282     if (i == 1) {
8283       switch (BuiltinID) {
8284       case NEON::BI__builtin_neon_vld2_v:
8285       case NEON::BI__builtin_neon_vld2q_v:
8286       case NEON::BI__builtin_neon_vld3_v:
8287       case NEON::BI__builtin_neon_vld3q_v:
8288       case NEON::BI__builtin_neon_vld4_v:
8289       case NEON::BI__builtin_neon_vld4q_v:
8290       case NEON::BI__builtin_neon_vld2_lane_v:
8291       case NEON::BI__builtin_neon_vld2q_lane_v:
8292       case NEON::BI__builtin_neon_vld3_lane_v:
8293       case NEON::BI__builtin_neon_vld3q_lane_v:
8294       case NEON::BI__builtin_neon_vld4_lane_v:
8295       case NEON::BI__builtin_neon_vld4q_lane_v:
8296       case NEON::BI__builtin_neon_vld2_dup_v:
8297       case NEON::BI__builtin_neon_vld2q_dup_v:
8298       case NEON::BI__builtin_neon_vld3_dup_v:
8299       case NEON::BI__builtin_neon_vld3q_dup_v:
8300       case NEON::BI__builtin_neon_vld4_dup_v:
8301       case NEON::BI__builtin_neon_vld4q_dup_v:
8302         // Get the alignment for the argument in addition to the value;
8303         // we'll use it later.
8304         PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
8305         Ops.push_back(PtrOp1.getPointer());
8306         continue;
8307       }
8308     }
8309 
8310     if ((ICEArguments & (1 << i)) == 0) {
8311       Ops.push_back(EmitScalarExpr(E->getArg(i)));
8312     } else {
8313       // If this is required to be a constant, constant fold it so that we know
8314       // that the generated intrinsic gets a ConstantInt.
8315       Ops.push_back(llvm::ConstantInt::get(
8316           getLLVMContext(),
8317           *E->getArg(i)->getIntegerConstantExpr(getContext())));
8318     }
8319   }
8320 
8321   switch (BuiltinID) {
8322   default: break;
8323 
8324   case NEON::BI__builtin_neon_vget_lane_i8:
8325   case NEON::BI__builtin_neon_vget_lane_i16:
8326   case NEON::BI__builtin_neon_vget_lane_i32:
8327   case NEON::BI__builtin_neon_vget_lane_i64:
8328   case NEON::BI__builtin_neon_vget_lane_bf16:
8329   case NEON::BI__builtin_neon_vget_lane_f32:
8330   case NEON::BI__builtin_neon_vgetq_lane_i8:
8331   case NEON::BI__builtin_neon_vgetq_lane_i16:
8332   case NEON::BI__builtin_neon_vgetq_lane_i32:
8333   case NEON::BI__builtin_neon_vgetq_lane_i64:
8334   case NEON::BI__builtin_neon_vgetq_lane_bf16:
8335   case NEON::BI__builtin_neon_vgetq_lane_f32:
8336   case NEON::BI__builtin_neon_vduph_lane_bf16:
8337   case NEON::BI__builtin_neon_vduph_laneq_bf16:
8338     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
8339 
8340   case NEON::BI__builtin_neon_vrndns_f32: {
8341     Value *Arg = EmitScalarExpr(E->getArg(0));
8342     llvm::Type *Tys[] = {Arg->getType()};
8343     Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
    return Builder.CreateCall(F, {Arg}, "vrndn");
  }
8345 
8346   case NEON::BI__builtin_neon_vset_lane_i8:
8347   case NEON::BI__builtin_neon_vset_lane_i16:
8348   case NEON::BI__builtin_neon_vset_lane_i32:
8349   case NEON::BI__builtin_neon_vset_lane_i64:
8350   case NEON::BI__builtin_neon_vset_lane_bf16:
8351   case NEON::BI__builtin_neon_vset_lane_f32:
8352   case NEON::BI__builtin_neon_vsetq_lane_i8:
8353   case NEON::BI__builtin_neon_vsetq_lane_i16:
8354   case NEON::BI__builtin_neon_vsetq_lane_i32:
8355   case NEON::BI__builtin_neon_vsetq_lane_i64:
8356   case NEON::BI__builtin_neon_vsetq_lane_bf16:
8357   case NEON::BI__builtin_neon_vsetq_lane_f32:
8358     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
8359 
8360   case NEON::BI__builtin_neon_vsha1h_u32:
8361     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
8362                         "vsha1h");
  case NEON::BI__builtin_neon_vsha1cq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
                        "vsha1c");
  case NEON::BI__builtin_neon_vsha1pq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
                        "vsha1p");
  case NEON::BI__builtin_neon_vsha1mq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                        "vsha1m");
8372 
8373   case NEON::BI__builtin_neon_vcvth_bf16_f32: {
8374     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
8375                         "vcvtbfp2bf");
8376   }
8377 
8378   // The ARM _MoveToCoprocessor builtins put the input register value as
8379   // the first argument, but the LLVM intrinsic expects it as the third one.
8380   case clang::ARM::BI_MoveToCoprocessor:
8381   case clang::ARM::BI_MoveToCoprocessor2: {
8382     Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
8383                                        ? Intrinsic::arm_mcr
8384                                        : Intrinsic::arm_mcr2);
8385     return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
8386                                   Ops[3], Ops[4], Ops[5]});
8387   }
8388   }
8389 
8390   // Get the last argument, which specifies the vector type.
8391   assert(HasExtraArg);
8392   const Expr *Arg = E->getArg(E->getNumArgs()-1);
8393   std::optional<llvm::APSInt> Result =
8394       Arg->getIntegerConstantExpr(getContext());
8395   if (!Result)
8396     return nullptr;
8397 
8398   if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
8399       BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
8400     // Determine the overloaded type of this builtin.
8401     llvm::Type *Ty;
8402     if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
8403       Ty = FloatTy;
8404     else
8405       Ty = DoubleTy;
8406 
8407     // Determine whether this is an unsigned conversion or not.
8408     bool usgn = Result->getZExtValue() == 1;
8409     unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
8410 
8411     // Call the appropriate intrinsic.
8412     Function *F = CGM.getIntrinsic(Int, Ty);
8413     return Builder.CreateCall(F, Ops, "vcvtr");
8414   }
8415 
8416   // Determine the type of this overloaded NEON intrinsic.
8417   NeonTypeFlags Type = Result->getZExtValue();
8418   bool usgn = Type.isUnsigned();
8419   bool rightShift = false;
8420 
8421   llvm::FixedVectorType *VTy =
8422       GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
8423                   getTarget().hasBFloat16Type());
8424   llvm::Type *Ty = VTy;
8425   if (!Ty)
8426     return nullptr;
8427 
8428   // Many NEON builtins have identical semantics and uses in ARM and
8429   // AArch64. Emit these in a single function.
8430   auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
8431   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
8432       IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
8433   if (Builtin)
8434     return EmitCommonNeonBuiltinExpr(
8435         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
8436         Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
8437 
8438   unsigned Int;
8439   switch (BuiltinID) {
8440   default: return nullptr;
8441   case NEON::BI__builtin_neon_vld1q_lane_v:
8442     // Handle 64-bit integer elements as a special case.  Use shuffles of
8443     // one-element vectors to avoid poor code for i64 in the backend.
8444     if (VTy->getElementType()->isIntegerTy(64)) {
8445       // Extract the other lane.
8446       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8447       int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
8448       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
8449       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8450       // Load the value as a one-element vector.
8451       Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
8452       llvm::Type *Tys[] = {Ty, Int8PtrTy};
8453       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
8454       Value *Align = getAlignmentValue32(PtrOp0);
8455       Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
8456       // Combine them.
8457       int Indices[] = {1 - Lane, Lane};
8458       return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
8459     }
8460     [[fallthrough]];
8461   case NEON::BI__builtin_neon_vld1_lane_v: {
8462     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8463     PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
8464     Value *Ld = Builder.CreateLoad(PtrOp0);
8465     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
8466   }
8467   case NEON::BI__builtin_neon_vqrshrn_n_v:
8468     Int =
8469       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
8470     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
8471                         1, true);
8472   case NEON::BI__builtin_neon_vqrshrun_n_v:
8473     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
8474                         Ops, "vqrshrun_n", 1, true);
8475   case NEON::BI__builtin_neon_vqshrn_n_v:
8476     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
8477     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
8478                         1, true);
8479   case NEON::BI__builtin_neon_vqshrun_n_v:
8480     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
8481                         Ops, "vqshrun_n", 1, true);
8482   case NEON::BI__builtin_neon_vrecpe_v:
8483   case NEON::BI__builtin_neon_vrecpeq_v:
8484     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
8485                         Ops, "vrecpe");
8486   case NEON::BI__builtin_neon_vrshrn_n_v:
8487     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
8488                         Ops, "vrshrn_n", 1, true);
8489   case NEON::BI__builtin_neon_vrsra_n_v:
8490   case NEON::BI__builtin_neon_vrsraq_n_v:
8491     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8492     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8493     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
8494     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
8495     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
8496     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
8497   case NEON::BI__builtin_neon_vsri_n_v:
8498   case NEON::BI__builtin_neon_vsriq_n_v:
8499     rightShift = true;
8500     [[fallthrough]];
8501   case NEON::BI__builtin_neon_vsli_n_v:
8502   case NEON::BI__builtin_neon_vsliq_n_v:
8503     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
8504     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
8505                         Ops, "vsli_n");
8506   case NEON::BI__builtin_neon_vsra_n_v:
8507   case NEON::BI__builtin_neon_vsraq_n_v:
8508     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8509     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
8510     return Builder.CreateAdd(Ops[0], Ops[1]);
8511   case NEON::BI__builtin_neon_vst1q_lane_v:
8512     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
8513     // a one-element vector and avoid poor code for i64 in the backend.
8514     if (VTy->getElementType()->isIntegerTy(64)) {
8515       Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8516       Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
8517       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8518       Ops[2] = getAlignmentValue32(PtrOp0);
8519       llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
8520       return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
8521                                                  Tys), Ops);
8522     }
8523     [[fallthrough]];
8524   case NEON::BI__builtin_neon_vst1_lane_v: {
8525     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8526     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
8527     return Builder.CreateStore(Ops[1],
8528                                PtrOp0.withElementType(Ops[1]->getType()));
8529   }
8530   case NEON::BI__builtin_neon_vtbl1_v:
8531     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
8532                         Ops, "vtbl1");
8533   case NEON::BI__builtin_neon_vtbl2_v:
8534     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
8535                         Ops, "vtbl2");
8536   case NEON::BI__builtin_neon_vtbl3_v:
8537     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
8538                         Ops, "vtbl3");
8539   case NEON::BI__builtin_neon_vtbl4_v:
8540     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
8541                         Ops, "vtbl4");
8542   case NEON::BI__builtin_neon_vtbx1_v:
8543     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
8544                         Ops, "vtbx1");
8545   case NEON::BI__builtin_neon_vtbx2_v:
8546     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
8547                         Ops, "vtbx2");
8548   case NEON::BI__builtin_neon_vtbx3_v:
8549     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
8550                         Ops, "vtbx3");
8551   case NEON::BI__builtin_neon_vtbx4_v:
8552     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
8553                         Ops, "vtbx4");
8554   }
8555 }
8556 
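// Evaluate an expression that is known to be an integer constant expression
// and return its value as the requested integer type.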
8557 template<typename Integer>
8558 static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
8559   return E->getIntegerConstantExpr(Context)->getExtValue();
8560 }
8561 
8562 static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
8563                                      llvm::Type *T, bool Unsigned) {
8564   // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
8565   // which finds it convenient to specify signed/unsigned as a boolean flag.
8566   return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
8567 }
8568 
8569 static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
8570                                     uint32_t Shift, bool Unsigned) {
8571   // MVE helper function for integer shift right. This must handle signed vs
8572   // unsigned, and also deal specially with the case where the shift count is
8573   // equal to the lane size. In LLVM IR, an LShr with that parameter would be
8574   // undefined behavior, but in MVE it's legal, so we must convert it to code
8575   // that is not undefined in IR.
8576   unsigned LaneBits = cast<llvm::VectorType>(V->getType())
8577                           ->getElementType()
8578                           ->getPrimitiveSizeInBits();
8579   if (Shift == LaneBits) {
8580     // An unsigned shift of the full lane size always generates zero, so we can
8581     // simply emit a zero vector. A signed shift of the full lane size does the
8582     // same thing as shifting by one bit fewer.
8583     if (Unsigned)
8584       return llvm::Constant::getNullValue(V->getType());
8585     else
8586       --Shift;
8587   }
8588   return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
8589 }
8590 
8591 static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
8592   // MVE-specific helper function for a vector splat, which infers the element
8593   // count of the output vector by knowing that MVE vectors are all 128 bits
8594   // wide.
8595   unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
8596   return Builder.CreateVectorSplat(Elements, V);
8597 }
8598 
8599 static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
8600                                             CodeGenFunction *CGF,
8601                                             llvm::Value *V,
8602                                             llvm::Type *DestType) {
8603   // Convert one MVE vector type into another by reinterpreting its in-register
8604   // format.
8605   //
  // In little-endian mode this is identical to a bitcast (which reinterprets
  // the memory format). In big-endian mode the two are not necessarily the
  // same, because the register and memory formats map to each other
  // differently depending on the lane size.
8610   //
8611   // We generate a bitcast whenever we can (if we're little-endian, or if the
8612   // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
8613   // that performs the different kind of reinterpretation.
8614   if (CGF->getTarget().isBigEndian() &&
8615       V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
8616     return Builder.CreateCall(
8617         CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
8618                               {DestType, V->getType()}),
8619         V);
8620   } else {
8621     return Builder.CreateBitCast(V, DestType);
8622   }
8623 }
8624 
8625 static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
8626   // Make a shufflevector that extracts every other element of a vector (evens
8627   // or odds, as desired).
8628   SmallVector<int, 16> Indices;
8629   unsigned InputElements =
8630       cast<llvm::FixedVectorType>(V->getType())->getNumElements();
8631   for (unsigned i = 0; i < InputElements; i += 2)
8632     Indices.push_back(i + Odd);
8633   return Builder.CreateShuffleVector(V, Indices);
8634 }
8635 
8636 static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
8637                               llvm::Value *V1) {
8638   // Make a shufflevector that interleaves two vectors element by element.
8639   assert(V0->getType() == V1->getType() && "Can't zip different vector types");
8640   SmallVector<int, 16> Indices;
8641   unsigned InputElements =
8642       cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
8643   for (unsigned i = 0; i < InputElements; i++) {
8644     Indices.push_back(i);
8645     Indices.push_back(i + InputElements);
8646   }
8647   return Builder.CreateShuffleVector(V0, V1, Indices);
8648 }
8649 
8650 template<unsigned HighBit, unsigned OtherBits>
8651 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
8652   // MVE-specific helper function to make a vector splat of a constant such as
8653   // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
8654   llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
8655   unsigned LaneBits = T->getPrimitiveSizeInBits();
8656   uint32_t Value = HighBit << (LaneBits - 1);
8657   if (OtherBits)
8658     Value |= (1UL << (LaneBits - 1)) - 1;
8659   llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
8660   return ARMMVEVectorSplat(Builder, Lane);
8661 }
8662 
8663 static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
8664                                                llvm::Value *V,
8665                                                unsigned ReverseWidth) {
8666   // MVE-specific helper function which reverses the elements of a
8667   // vector within every (ReverseWidth)-bit collection of lanes.
8668   SmallVector<int, 16> Indices;
8669   unsigned LaneSize = V->getType()->getScalarSizeInBits();
8670   unsigned Elements = 128 / LaneSize;
8671   unsigned Mask = ReverseWidth / LaneSize - 1;
8672   for (unsigned i = 0; i < Elements; i++)
8673     Indices.push_back(i ^ Mask);
8674   return Builder.CreateShuffleVector(V, Indices);
8675 }
8676 
8677 Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
8678                                               const CallExpr *E,
8679                                               ReturnValueSlot ReturnValue,
8680                                               llvm::Triple::ArchType Arch) {
8681   enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
8682   Intrinsic::ID IRIntr;
8683   unsigned NumVectors;
8684 
8685   // Code autogenerated by Tablegen will handle all the simple builtins.
8686   switch (BuiltinID) {
8687     #include "clang/Basic/arm_mve_builtin_cg.inc"
8688 
8689     // If we didn't match an MVE builtin id at all, go back to the
8690     // main EmitARMBuiltinExpr.
8691   default:
8692     return nullptr;
8693   }
8694 
8695   // Anything that breaks from that switch is an MVE builtin that
8696   // needs handwritten code to generate.
8697 
8698   switch (CustomCodeGenType) {
8699 
8700   case CustomCodeGen::VLD24: {
8701     llvm::SmallVector<Value *, 4> Ops;
8702     llvm::SmallVector<llvm::Type *, 4> Tys;
8703 
8704     auto MvecCType = E->getType();
8705     auto MvecLType = ConvertType(MvecCType);
8706     assert(MvecLType->isStructTy() &&
8707            "Return type for vld[24]q should be a struct");
8708     assert(MvecLType->getStructNumElements() == 1 &&
8709            "Return-type struct for vld[24]q should have one element");
8710     auto MvecLTypeInner = MvecLType->getStructElementType(0);
8711     assert(MvecLTypeInner->isArrayTy() &&
8712            "Return-type struct for vld[24]q should contain an array");
8713     assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
8714            "Array member of return-type struct vld[24]q has wrong length");
8715     auto VecLType = MvecLTypeInner->getArrayElementType();
8716 
8717     Tys.push_back(VecLType);
8718 
8719     auto Addr = E->getArg(0);
8720     Ops.push_back(EmitScalarExpr(Addr));
8721     Tys.push_back(ConvertType(Addr->getType()));
8722 
8723     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
8724     Value *LoadResult = Builder.CreateCall(F, Ops);
8725     Value *MvecOut = PoisonValue::get(MvecLType);
8726     for (unsigned i = 0; i < NumVectors; ++i) {
8727       Value *Vec = Builder.CreateExtractValue(LoadResult, i);
8728       MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
8729     }
8730 
8731     if (ReturnValue.isNull())
8732       return MvecOut;
8733     else
8734       return Builder.CreateStore(MvecOut, ReturnValue.getValue());
8735   }
8736 
8737   case CustomCodeGen::VST24: {
8738     llvm::SmallVector<Value *, 4> Ops;
8739     llvm::SmallVector<llvm::Type *, 4> Tys;
8740 
8741     auto Addr = E->getArg(0);
8742     Ops.push_back(EmitScalarExpr(Addr));
8743     Tys.push_back(ConvertType(Addr->getType()));
8744 
8745     auto MvecCType = E->getArg(1)->getType();
8746     auto MvecLType = ConvertType(MvecCType);
8747     assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
8748     assert(MvecLType->getStructNumElements() == 1 &&
8749            "Data-type struct for vst2q should have one element");
8750     auto MvecLTypeInner = MvecLType->getStructElementType(0);
8751     assert(MvecLTypeInner->isArrayTy() &&
8752            "Data-type struct for vst2q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of data-type struct for vst[24]q has wrong length");
8755     auto VecLType = MvecLTypeInner->getArrayElementType();
8756 
8757     Tys.push_back(VecLType);
8758 
8759     AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
8760     EmitAggExpr(E->getArg(1), MvecSlot);
8761     auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
8762     for (unsigned i = 0; i < NumVectors; i++)
8763       Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
8764 
8765     Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
8766     Value *ToReturn = nullptr;
8767     for (unsigned i = 0; i < NumVectors; i++) {
8768       Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
8769       ToReturn = Builder.CreateCall(F, Ops);
8770       Ops.pop_back();
8771     }
8772     return ToReturn;
8773   }
8774   }
8775   llvm_unreachable("unknown custom codegen type.");
8776 }
8777 
8778 Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
8779                                               const CallExpr *E,
8780                                               ReturnValueSlot ReturnValue,
8781                                               llvm::Triple::ArchType Arch) {
8782   switch (BuiltinID) {
8783   default:
8784     return nullptr;
8785 #include "clang/Basic/arm_cde_builtin_cg.inc"
8786   }
8787 }
8788 
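// Emit the AArch64 forms of the NEON table lookup builtins (vtbl/vtbx and
// their vqtbl/vqtbx variants) using the aarch64.neon.tbl/tbx intrinsics.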
8789 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
8790                                       const CallExpr *E,
8791                                       SmallVectorImpl<Value *> &Ops,
8792                                       llvm::Triple::ArchType Arch) {
8793   unsigned int Int = 0;
8794   const char *s = nullptr;
8795 
8796   switch (BuiltinID) {
8797   default:
8798     return nullptr;
8799   case NEON::BI__builtin_neon_vtbl1_v:
8800   case NEON::BI__builtin_neon_vqtbl1_v:
8801   case NEON::BI__builtin_neon_vqtbl1q_v:
8802   case NEON::BI__builtin_neon_vtbl2_v:
8803   case NEON::BI__builtin_neon_vqtbl2_v:
8804   case NEON::BI__builtin_neon_vqtbl2q_v:
8805   case NEON::BI__builtin_neon_vtbl3_v:
8806   case NEON::BI__builtin_neon_vqtbl3_v:
8807   case NEON::BI__builtin_neon_vqtbl3q_v:
8808   case NEON::BI__builtin_neon_vtbl4_v:
8809   case NEON::BI__builtin_neon_vqtbl4_v:
8810   case NEON::BI__builtin_neon_vqtbl4q_v:
8811     break;
8812   case NEON::BI__builtin_neon_vtbx1_v:
8813   case NEON::BI__builtin_neon_vqtbx1_v:
8814   case NEON::BI__builtin_neon_vqtbx1q_v:
8815   case NEON::BI__builtin_neon_vtbx2_v:
8816   case NEON::BI__builtin_neon_vqtbx2_v:
8817   case NEON::BI__builtin_neon_vqtbx2q_v:
8818   case NEON::BI__builtin_neon_vtbx3_v:
8819   case NEON::BI__builtin_neon_vqtbx3_v:
8820   case NEON::BI__builtin_neon_vqtbx3q_v:
8821   case NEON::BI__builtin_neon_vtbx4_v:
8822   case NEON::BI__builtin_neon_vqtbx4_v:
8823   case NEON::BI__builtin_neon_vqtbx4q_v:
8824     break;
8825   }
8826 
8827   assert(E->getNumArgs() >= 3);
8828 
8829   // Get the last argument, which specifies the vector type.
8830   const Expr *Arg = E->getArg(E->getNumArgs() - 1);
8831   std::optional<llvm::APSInt> Result =
8832       Arg->getIntegerConstantExpr(CGF.getContext());
8833   if (!Result)
8834     return nullptr;
8835 
8836   // Determine the type of this overloaded NEON intrinsic.
8837   NeonTypeFlags Type = Result->getZExtValue();
8838   llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
8839   if (!Ty)
8840     return nullptr;
8841 
8842   CodeGen::CGBuilderTy &Builder = CGF.Builder;
8843 
  // AArch64 scalar builtins are not overloaded; they do not have an extra
  // argument that specifies the vector type, so each case must be handled
  // individually.
8846   switch (BuiltinID) {
8847   case NEON::BI__builtin_neon_vtbl1_v: {
8848     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
8849                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
8850   }
8851   case NEON::BI__builtin_neon_vtbl2_v: {
8852     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
8853                               Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
8854   }
8855   case NEON::BI__builtin_neon_vtbl3_v: {
8856     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
8857                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
8858   }
8859   case NEON::BI__builtin_neon_vtbl4_v: {
8860     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
8861                               Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
8862   }
8863   case NEON::BI__builtin_neon_vtbx1_v: {
8864     Value *TblRes =
8865         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
8866                            Intrinsic::aarch64_neon_tbl1, "vtbl1");
8867 
8868     llvm::Constant *EightV = ConstantInt::get(Ty, 8);
8869     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
8870     CmpRes = Builder.CreateSExt(CmpRes, Ty);
8871 
8872     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
8873     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
8874     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
8875   }
8876   case NEON::BI__builtin_neon_vtbx2_v: {
8877     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
8878                               Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
8879   }
8880   case NEON::BI__builtin_neon_vtbx3_v: {
8881     Value *TblRes =
8882         packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
8883                            Intrinsic::aarch64_neon_tbl2, "vtbl2");
8884 
8885     llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
8886     Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
8887                                            TwentyFourV);
8888     CmpRes = Builder.CreateSExt(CmpRes, Ty);
8889 
8890     Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
8891     Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
8892     return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
8893   }
8894   case NEON::BI__builtin_neon_vtbx4_v: {
8895     return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
8896                               Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
8897   }
8898   case NEON::BI__builtin_neon_vqtbl1_v:
8899   case NEON::BI__builtin_neon_vqtbl1q_v:
8900     Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
8901   case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
8903     Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
8904   case NEON::BI__builtin_neon_vqtbl3_v:
8905   case NEON::BI__builtin_neon_vqtbl3q_v:
8906     Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
8907   case NEON::BI__builtin_neon_vqtbl4_v:
8908   case NEON::BI__builtin_neon_vqtbl4q_v:
8909     Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
8910   case NEON::BI__builtin_neon_vqtbx1_v:
8911   case NEON::BI__builtin_neon_vqtbx1q_v:
8912     Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
8913   case NEON::BI__builtin_neon_vqtbx2_v:
8914   case NEON::BI__builtin_neon_vqtbx2q_v:
8915     Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
8916   case NEON::BI__builtin_neon_vqtbx3_v:
8917   case NEON::BI__builtin_neon_vqtbx3q_v:
8918     Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
8919   case NEON::BI__builtin_neon_vqtbx4_v:
8920   case NEON::BI__builtin_neon_vqtbx4q_v:
8921     Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
  }
8924 
8925   if (!Int)
8926     return nullptr;
8927 
8928   Function *F = CGF.CGM.getIntrinsic(Int, Ty);
8929   return CGF.EmitNeonCall(F, Ops, s);
8930 }
8931 
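// Wrap a scalar value in lane 0 of a <4 x i16> vector; the remaining lanes are
// left as poison.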
8932 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
8933   auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
8934   Op = Builder.CreateBitCast(Op, Int16Ty);
8935   Value *V = PoisonValue::get(VTy);
8936   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
8937   Op = Builder.CreateInsertElement(V, Op, CI);
8938   return Op;
8939 }
8940 
8941 /// SVEBuiltinMemEltTy - Returns the memory element type for this memory
8942 /// access builtin.  Only required if it can't be inferred from the base pointer
8943 /// operand.
8944 llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
8945   switch (TypeFlags.getMemEltType()) {
8946   case SVETypeFlags::MemEltTyDefault:
8947     return getEltType(TypeFlags);
8948   case SVETypeFlags::MemEltTyInt8:
8949     return Builder.getInt8Ty();
8950   case SVETypeFlags::MemEltTyInt16:
8951     return Builder.getInt16Ty();
8952   case SVETypeFlags::MemEltTyInt32:
8953     return Builder.getInt32Ty();
8954   case SVETypeFlags::MemEltTyInt64:
8955     return Builder.getInt64Ty();
8956   }
8957   llvm_unreachable("Unknown MemEltType");
8958 }
8959 
8960 llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
8961   switch (TypeFlags.getEltType()) {
8962   default:
8963     llvm_unreachable("Invalid SVETypeFlag!");
8964 
8965   case SVETypeFlags::EltTyInt8:
8966     return Builder.getInt8Ty();
8967   case SVETypeFlags::EltTyInt16:
8968     return Builder.getInt16Ty();
8969   case SVETypeFlags::EltTyInt32:
8970     return Builder.getInt32Ty();
8971   case SVETypeFlags::EltTyInt64:
8972     return Builder.getInt64Ty();
8973   case SVETypeFlags::EltTyInt128:
8974     return Builder.getInt128Ty();
8975 
8976   case SVETypeFlags::EltTyFloat16:
8977     return Builder.getHalfTy();
8978   case SVETypeFlags::EltTyFloat32:
8979     return Builder.getFloatTy();
8980   case SVETypeFlags::EltTyFloat64:
8981     return Builder.getDoubleTy();
8982 
8983   case SVETypeFlags::EltTyBFloat16:
8984     return Builder.getBFloatTy();
8985 
8986   case SVETypeFlags::EltTyBool8:
8987   case SVETypeFlags::EltTyBool16:
8988   case SVETypeFlags::EltTyBool32:
8989   case SVETypeFlags::EltTyBool64:
8990     return Builder.getInt1Ty();
8991   }
8992 }
8993 
8994 // Return the llvm predicate vector type corresponding to the specified element
8995 // TypeFlags.
8996 llvm::ScalableVectorType *
8997 CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
8998   switch (TypeFlags.getEltType()) {
8999   default: llvm_unreachable("Unhandled SVETypeFlag!");
9000 
9001   case SVETypeFlags::EltTyInt8:
9002     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9003   case SVETypeFlags::EltTyInt16:
9004     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9005   case SVETypeFlags::EltTyInt32:
9006     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9007   case SVETypeFlags::EltTyInt64:
9008     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9009 
9010   case SVETypeFlags::EltTyBFloat16:
9011     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9012   case SVETypeFlags::EltTyFloat16:
9013     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9014   case SVETypeFlags::EltTyFloat32:
9015     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9016   case SVETypeFlags::EltTyFloat64:
9017     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9018 
9019   case SVETypeFlags::EltTyBool8:
9020     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9021   case SVETypeFlags::EltTyBool16:
9022     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9023   case SVETypeFlags::EltTyBool32:
9024     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9025   case SVETypeFlags::EltTyBool64:
9026     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9027   }
9028 }
9029 
9030 // Return the llvm vector type corresponding to the specified element TypeFlags.
9031 llvm::ScalableVectorType *
9032 CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
9033   switch (TypeFlags.getEltType()) {
9034   default:
9035     llvm_unreachable("Invalid SVETypeFlag!");
9036 
9037   case SVETypeFlags::EltTyInt8:
9038     return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
9039   case SVETypeFlags::EltTyInt16:
9040     return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
9041   case SVETypeFlags::EltTyInt32:
9042     return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
9043   case SVETypeFlags::EltTyInt64:
9044     return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
9045 
9046   case SVETypeFlags::EltTyFloat16:
9047     return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
9048   case SVETypeFlags::EltTyBFloat16:
9049     return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
9050   case SVETypeFlags::EltTyFloat32:
9051     return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
9052   case SVETypeFlags::EltTyFloat64:
9053     return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);
9054 
9055   case SVETypeFlags::EltTyBool8:
9056     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9057   case SVETypeFlags::EltTyBool16:
9058     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9059   case SVETypeFlags::EltTyBool32:
9060     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9061   case SVETypeFlags::EltTyBool64:
9062     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9063   }
9064 }
9065 
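// Emit an all-true SVE predicate of the type implied by TypeFlags, using ptrue
// with the SV_ALL pattern.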
9066 llvm::Value *
9067 CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
9068   Function *Ptrue =
9069       CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
9070   return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
9071 }
9072 
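// SVE vectors are built from 128-bit blocks, so a full vector of a given
// element type holds at least 128 / <element size in bits> elements.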
9073 constexpr unsigned SVEBitsPerBlock = 128;
9074 
9075 static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
9076   unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
9077   return llvm::ScalableVectorType::get(EltTy, NumElts);
9078 }
9079 
9080 // Reinterpret the input predicate so that it can be used to correctly isolate
9081 // the elements of the specified datatype.
9082 Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
9083                                              llvm::ScalableVectorType *VTy) {
9084   auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
9085   if (Pred->getType() == RTy)
9086     return Pred;
9087 
9088   unsigned IntID;
9089   llvm::Type *IntrinsicTy;
9090   switch (VTy->getMinNumElements()) {
9091   default:
9092     llvm_unreachable("unsupported element count!");
9093   case 1:
9094   case 2:
9095   case 4:
9096   case 8:
9097     IntID = Intrinsic::aarch64_sve_convert_from_svbool;
9098     IntrinsicTy = RTy;
9099     break;
9100   case 16:
9101     IntID = Intrinsic::aarch64_sve_convert_to_svbool;
9102     IntrinsicTy = Pred->getType();
9103     break;
9104   }
9105 
9106   Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
9107   Value *C = Builder.CreateCall(F, Pred);
9108   assert(C->getType() == RTy && "Unexpected return type!");
9109   return C;
9110 }
9111 
9112 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
9113                                           SmallVectorImpl<Value *> &Ops,
9114                                           unsigned IntID) {
9115   auto *ResultTy = getSVEType(TypeFlags);
9116   auto *OverloadedTy =
9117       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
9118 
9119   // At the ACLE level there's only one predicate type, svbool_t, which is
9120   // mapped to <n x 16 x i1>. However, this might be incompatible with the
  // actual type being loaded. For example, when loading doubles (64-bit
  // elements) the predicate should be <n x 2 x i1> instead. At the IR level
  // the type of the predicate and the data being loaded must match. Cast
  // accordingly.
9124   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
9125 
9126   Function *F = nullptr;
9127   if (Ops[1]->getType()->isVectorTy())
9128     // This is the "vector base, scalar offset" case. In order to uniquely
9129     // map this built-in to an LLVM IR intrinsic, we need both the return type
9130     // and the type of the vector base.
9131     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
9132   else
9133     // This is the "scalar base, vector offset case". The type of the offset
9134     // is encoded in the name of the intrinsic. We only need to specify the
9135     // return type in order to uniquely map this built-in to an LLVM IR
9136     // intrinsic.
9137     F = CGM.getIntrinsic(IntID, OverloadedTy);
9138 
9139   // Pass 0 when the offset is missing. This can only be applied when using
9140   // the "vector base" addressing mode for which ACLE allows no offset. The
9141   // corresponding LLVM IR always requires an offset.
9142   if (Ops.size() == 2) {
9143     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
9144     Ops.push_back(ConstantInt::get(Int64Ty, 0));
9145   }
9146 
9147   // For "vector base, scalar index" scale the index so that it becomes a
9148   // scalar offset.
9149   if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
9150     unsigned BytesPerElt =
9151         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9152     Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9153   }
9154 
9155   Value *Call = Builder.CreateCall(F, Ops);
9156 
9157   // The following sext/zext is only needed when ResultTy != OverloadedTy. In
9158   // other cases it's folded into a nop.
9159   return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
9160                                   : Builder.CreateSExt(Call, ResultTy);
9161 }
9162 
9163 Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
9164                                             SmallVectorImpl<Value *> &Ops,
9165                                             unsigned IntID) {
9166   auto *SrcDataTy = getSVEType(TypeFlags);
9167   auto *OverloadedTy =
9168       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
9169 
9170   // In ACLE the source data is passed in the last argument, whereas in LLVM IR
9171   // it's the first argument. Move it accordingly.
9172   Ops.insert(Ops.begin(), Ops.pop_back_val());
9173 
9174   Function *F = nullptr;
9175   if (Ops[2]->getType()->isVectorTy())
9176     // This is the "vector base, scalar offset" case. In order to uniquely
9177     // map this built-in to an LLVM IR intrinsic, we need both the return type
9178     // and the type of the vector base.
9179     F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
9180   else
9181     // This is the "scalar base, vector offset case". The type of the offset
9182     // is encoded in the name of the intrinsic. We only need to specify the
9183     // return type in order to uniquely map this built-in to an LLVM IR
9184     // intrinsic.
9185     F = CGM.getIntrinsic(IntID, OverloadedTy);
9186 
9187   // Pass 0 when the offset is missing. This can only be applied when using
9188   // the "vector base" addressing mode for which ACLE allows no offset. The
9189   // corresponding LLVM IR always requires an offset.
9190   if (Ops.size() == 3) {
9191     assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
9192     Ops.push_back(ConstantInt::get(Int64Ty, 0));
9193   }
9194 
9195   // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
9196   // folded into a nop.
9197   Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
9198 
9199   // At the ACLE level there's only one predicate type, svbool_t, which is
9200   // mapped to <n x 16 x i1>. However, this might be incompatible with the
  // actual type being stored. For example, when storing doubles (64-bit
  // elements) the predicate should be <n x 2 x i1> instead. At the IR level
  // the type of the predicate and the data being stored must match. Cast
  // accordingly.
9204   Ops[1] = EmitSVEPredicateCast(Ops[1], OverloadedTy);
9205 
9206   // For "vector base, scalar index" scale the index so that it becomes a
9207   // scalar offset.
9208   if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
9209     unsigned BytesPerElt =
9210         OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9211     Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt));
9212   }
9213 
9214   return Builder.CreateCall(F, Ops);
9215 }
9216 
9217 Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
9218                                               SmallVectorImpl<Value *> &Ops,
9219                                               unsigned IntID) {
9220   // The gather prefetches are overloaded on the vector input - this can either
9221   // be the vector of base addresses or vector of offsets.
9222   auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
9223   if (!OverloadedTy)
9224     OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());
9225 
9226   // Cast the predicate from svbool_t to the right number of elements.
9227   Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
9228 
9229   // vector + imm addressing modes
9230   if (Ops[1]->getType()->isVectorTy()) {
9231     if (Ops.size() == 3) {
9232       // Pass 0 for 'vector+imm' when the index is omitted.
9233       Ops.push_back(ConstantInt::get(Int64Ty, 0));
9234 
9235       // The sv_prfop is the last operand in the builtin and IR intrinsic.
9236       std::swap(Ops[2], Ops[3]);
9237     } else {
9238       // Index needs to be passed as scaled offset.
9239       llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9240       unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
9241       if (BytesPerElt > 1)
9242         Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9243     }
9244   }
9245 
9246   Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
9247   return Builder.CreateCall(F, Ops);
9248 }
9249 
9250 Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
9251                                           SmallVectorImpl<Value*> &Ops,
9252                                           unsigned IntID) {
9253   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9254   auto VecPtrTy = llvm::PointerType::getUnqual(VTy);
9255   auto EltPtrTy = llvm::PointerType::getUnqual(VTy->getElementType());
9256 
9257   unsigned N;
9258   switch (IntID) {
9259   case Intrinsic::aarch64_sve_ld2_sret:
9260     N = 2;
9261     break;
9262   case Intrinsic::aarch64_sve_ld3_sret:
9263     N = 3;
9264     break;
9265   case Intrinsic::aarch64_sve_ld4_sret:
9266     N = 4;
9267     break;
9268   default:
9269     llvm_unreachable("unknown intrinsic!");
9270   }
9271   auto RetTy = llvm::VectorType::get(VTy->getElementType(),
9272                                      VTy->getElementCount() * N);
9273 
  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
  Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy);
9276 
9277   // Does the load have an offset?
9278   if (Ops.size() > 2)
9279     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9280 
9281   BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy);
9282   Function *F = CGM.getIntrinsic(IntID, {VTy});
9283   Value *Call = Builder.CreateCall(F, {Predicate, BasePtr});
9284   unsigned MinElts = VTy->getMinNumElements();
9285   Value *Ret = llvm::PoisonValue::get(RetTy);
9286   for (unsigned I = 0; I < N; I++) {
9287     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
9288     Value *SRet = Builder.CreateExtractValue(Call, I);
9289     Ret = Builder.CreateInsertVector(RetTy, Ret, SRet, Idx);
9290   }
9291   return Ret;
9292 }
9293 
9294 Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
9295                                            SmallVectorImpl<Value*> &Ops,
9296                                            unsigned IntID) {
9297   llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9298   auto VecPtrTy = llvm::PointerType::getUnqual(VTy);
9299   auto EltPtrTy = llvm::PointerType::getUnqual(VTy->getElementType());
9300 
9301   unsigned N;
9302   switch (IntID) {
9303   case Intrinsic::aarch64_sve_st2:
9304     N = 2;
9305     break;
9306   case Intrinsic::aarch64_sve_st3:
9307     N = 3;
9308     break;
9309   case Intrinsic::aarch64_sve_st4:
9310     N = 4;
9311     break;
9312   default:
9313     llvm_unreachable("unknown intrinsic!");
9314   }
9315 
9316   Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
9317   Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy);
9318 
9319   // Does the store have an offset?
9320   if (Ops.size() > 3)
9321     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9322 
9323   BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy);
9324   Value *Val = Ops.back();
9325 
9326   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
9327   // need to break up the tuple vector.
9328   SmallVector<llvm::Value*, 5> Operands;
9329   unsigned MinElts = VTy->getElementCount().getKnownMinValue();
9330   for (unsigned I = 0; I < N; ++I) {
9331     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
9332     Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx));
9333   }
9334   Operands.append({Predicate, BasePtr});
9335 
9336   Function *F = CGM.getIntrinsic(IntID, { VTy });
9337   return Builder.CreateCall(F, Operands);
9338 }
9339 
9340 // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
9341 // svpmullt_pair intrinsics, with the exception that their results are bitcast
9342 // to a wider type.
9343 Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
9344                                      SmallVectorImpl<Value *> &Ops,
9345                                      unsigned BuiltinID) {
9346   // Splat scalar operand to vector (intrinsics with _n infix)
9347   if (TypeFlags.hasSplatOperand()) {
9348     unsigned OpNo = TypeFlags.getSplatOperand();
9349     Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
9350   }
9351 
9352   // The pair-wise function has a narrower overloaded type.
9353   Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
9354   Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
9355 
9356   // Now bitcast to the wider result type.
9357   llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
9358   return EmitSVEReinterpret(Call, Ty);
9359 }
9360 
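// Emit an SVE 'movl' builtin by calling the underlying widening intrinsic on
// the operand with a constant-zero second operand.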
9361 Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
9362                                     ArrayRef<Value *> Ops, unsigned BuiltinID) {
9363   llvm::Type *OverloadedTy = getSVEType(TypeFlags);
9364   Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
9365   return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
9366 }
9367 
9368 Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
9369                                             SmallVectorImpl<Value *> &Ops,
9370                                             unsigned BuiltinID) {
9371   auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9372   auto *VectorTy = getSVEVectorForElementType(MemEltTy);
9373   auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9374 
9375   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9376   Value *BasePtr = Ops[1];
9377 
  // If an index operand was supplied, fold it into the base pointer.
9379   if (Ops.size() > 3)
9380     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9381 
9382   Value *PrfOp = Ops.back();
9383 
9384   Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
9385   return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
9386 }
9387 
9388 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
9389                                           llvm::Type *ReturnTy,
9390                                           SmallVectorImpl<Value *> &Ops,
9391                                           unsigned BuiltinID,
9392                                           bool IsZExtReturn) {
9393   QualType LangPTy = E->getArg(1)->getType();
9394   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9395       LangPTy->castAs<PointerType>()->getPointeeType());
9396 
9397   // The vector type that is returned may be different from the
9398   // eventual type loaded from memory.
9399   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
9400   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9401 
9402   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9403   Value *BasePtr = Ops[1];
9404 
9405   // Does the load have an offset?
9406   if (Ops.size() > 2)
9407     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9408 
9409   Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
9410   auto *Load =
9411       cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
9412   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9413   CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
9414 
9415   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
                      : Builder.CreateSExt(Load, VectorTy);
9417 }
9418 
9419 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
9420                                            SmallVectorImpl<Value *> &Ops,
9421                                            unsigned BuiltinID) {
9422   QualType LangPTy = E->getArg(1)->getType();
9423   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9424       LangPTy->castAs<PointerType>()->getPointeeType());
9425 
9426   // The vector type that is stored may be different from the
9427   // eventual type stored to memory.
9428   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
9429   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9430 
9431   Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9432   Value *BasePtr = Ops[1];
9433 
9434   // Does the store have an offset?
9435   if (Ops.size() == 4)
9436     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9437 
9438   // Last value is always the data
9439   llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
9440 
9441   Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
9442   auto *Store =
9443       cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
9444   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9445   CGM.DecorateInstructionWithTBAA(Store, TBAAInfo);
9446   return Store;
9447 }
9448 
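// Compute a ZA tile-slice index as Base plus the zero-extended Offset.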
9449 Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) {
9450   llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false);
9451   return Builder.CreateAdd(Base, CastOffset, "tileslice");
9452 }
9453 
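// Emit an SME ld1/st1 builtin. The operands are reordered to match the LLVM
// intrinsic (predicate, pointer, tile, slice), and an optional vnum operand is
// folded into the pointer as vnum times the streaming vector length in bytes.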
9454 Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags,
9455                                       SmallVectorImpl<Value *> &Ops,
9456                                       unsigned IntID) {
9457   Ops[3] = EmitSVEPredicateCast(
9458       Ops[3], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
9459 
9460   SmallVector<Value *> NewOps;
9461   NewOps.push_back(Ops[3]);
9462 
9463   llvm::Value *BasePtr = Ops[4];
9464 
  // If the intrinsic has a vnum parameter, multiply it by the streaming
  // vector length in bytes and fold the result into the base pointer.
9467   if (Ops.size() == 6) {
9468     Function *StreamingVectorLength =
9469         CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
9470     llvm::Value *StreamingVectorLengthCall =
9471         Builder.CreateCall(StreamingVectorLength);
9472     llvm::Value *Mulvl =
9473         Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl");
9474     // The type of the ptr parameter is void *, so use Int8Ty here.
9475     BasePtr = Builder.CreateGEP(Int8Ty, Ops[4], Mulvl);
9476   }
9477   NewOps.push_back(BasePtr);
9478   NewOps.push_back(Ops[0]);
9479   NewOps.push_back(EmitTileslice(Ops[2], Ops[1]));
9480   Function *F = CGM.getIntrinsic(IntID);
9481   return Builder.CreateCall(F, NewOps);
9482 }
9483 
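// Emit an SME read (ZA to vector) or write (vector to ZA) builtin, merging the
// slice base and offset operands into a single tile-slice index and casting
// the predicate to the overloaded vector type.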
9484 Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags,
9485                                          SmallVectorImpl<Value *> &Ops,
9486                                          unsigned IntID) {
9487   auto *VecTy = getSVEType(TypeFlags);
9488   Function *F = CGM.getIntrinsic(IntID, VecTy);
9489   if (TypeFlags.isReadZA()) {
9490     Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
9491     Ops[3] = EmitTileslice(Ops[4], Ops[3]);
9492     Ops.erase(&Ops[4]);
9493   } else if (TypeFlags.isWriteZA()) {
9494     Ops[1] = EmitTileslice(Ops[2], Ops[1]);
9495     Ops[2] = EmitSVEPredicateCast(Ops[3], VecTy);
9496     Ops.erase(&Ops[3]);
9497   }
9498   return Builder.CreateCall(F, Ops);
9499 }
9500 
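// Emit an SME zero builtin. svzero_mask_za passes its tile mask through, while
// svzero_za is given an all-ones mask so that the entire ZA array is zeroed.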
9501 Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags,
9502                                     SmallVectorImpl<Value *> &Ops,
9503                                     unsigned IntID) {
  // svzero_za() intrinsic zeros the entire ZA array and has no parameters.
9505   if (Ops.size() == 0)
9506     Ops.push_back(llvm::ConstantInt::get(Int32Ty, 255));
9507   Function *F = CGM.getIntrinsic(IntID, {});
9508   return Builder.CreateCall(F, Ops);
9509 }
9510 
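// Emit an SME ldr/str builtin. The constant vnum operand is folded into both
// the pointer (scaled by the streaming vector length in bytes) and the tile
// slice before the intrinsic is called.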
9511 Value *CodeGenFunction::EmitSMELdrStr(SVETypeFlags TypeFlags,
9512                                       SmallVectorImpl<Value *> &Ops,
9513                                       unsigned IntID) {
9514   Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
9515   llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
9516   llvm::Value *MulVL = Builder.CreateMul(
9517       CntsbCall,
9518       Builder.getInt64(cast<llvm::ConstantInt>(Ops[1])->getZExtValue()),
9519       "mulvl");
9520   Ops[2] = Builder.CreateGEP(Int8Ty, Ops[2], MulVL);
9521   Ops[0] = EmitTileslice(Ops[1], Ops[0]);
9522   Ops.erase(&Ops[1]);
9523   Function *F = CGM.getIntrinsic(IntID, {});
9524   return Builder.CreateCall(F, Ops);
9525 }
9526 
// Splat a scalar operand across all lanes of the given SVE vector type.
9529 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
9530   return Builder.CreateVectorSplat(
9531       cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
9532 }
9533 
9534 Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) {
9535   return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
9536 }
9537 
9538 Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
  // FIXME: For big endian this needs an additional REV, or needs a separate
  // intrinsic that is code-generated as a no-op, because the LLVM bitcast
  // instruction is defined as 'bitwise' equivalent from a memory point of
  // view (when storing/reloading), whereas the svreinterpret builtin
  // implements a bitwise-equivalent cast from a register point of view.
  // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
9545   return Builder.CreateBitCast(Val, Ty);
9546 }
9547 
9548 static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
9549                                       SmallVectorImpl<Value *> &Ops) {
9550   auto *SplatZero = Constant::getNullValue(Ty);
9551   Ops.insert(Ops.begin(), SplatZero);
9552 }
9553 
9554 static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
9555                                        SmallVectorImpl<Value *> &Ops) {
9556   auto *SplatUndef = UndefValue::get(Ty);
9557   Ops.insert(Ops.begin(), SplatUndef);
9558 }
9559 
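// Return the list of types used to overload an SVE intrinsic, based on the
// overload kind encoded in the builtin's type flags.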
9560 SmallVector<llvm::Type *, 2>
9561 CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
9562                                      llvm::Type *ResultType,
9563                                      ArrayRef<Value *> Ops) {
9564   if (TypeFlags.isOverloadNone())
9565     return {};
9566 
9567   llvm::Type *DefaultType = getSVEType(TypeFlags);
9568 
9569   if (TypeFlags.isOverloadWhile())
9570     return {DefaultType, Ops[1]->getType()};
9571 
9572   if (TypeFlags.isOverloadWhileRW())
9573     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
9574 
9575   if (TypeFlags.isOverloadCvt())
9576     return {Ops[0]->getType(), Ops.back()->getType()};
9577 
9578   assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
9579   return {DefaultType};
9580 }
9581 
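// Emit an SVE tuple set/get builtin as an insert/extract of a single-vector
// subvector at offset I * (minimum elements per single vector).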
9582 Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
9583                                              llvm::Type *Ty,
9584                                              ArrayRef<Value *> Ops) {
  assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
         "Expects TypeFlags.isTupleSet() or TypeFlags.isTupleGet()");
9587 
9588   unsigned I = cast<ConstantInt>(Ops[1])->getSExtValue();
9589   auto *SingleVecTy = dyn_cast<llvm::ScalableVectorType>(
9590                       TypeFlags.isTupleSet() ? Ops[2]->getType() : Ty);
9591   Value *Idx = ConstantInt::get(CGM.Int64Ty,
9592                                 I * SingleVecTy->getMinNumElements());
9593 
9594   if (TypeFlags.isTupleSet())
9595     return Builder.CreateInsertVector(Ty, Ops[0], Ops[2], Idx);
9596   return Builder.CreateExtractVector(Ty, Ops[0], Idx);
9597 }
9598 
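// Emit an SVE tuple create builtin by inserting each operand into consecutive
// subvector positions of an initially poison result.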
Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
                                           llvm::Type *Ty,
                                           ArrayRef<Value *> Ops) {
  assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
9603 
9604   auto *SrcTy = dyn_cast<llvm::ScalableVectorType>(Ops[0]->getType());
9605   unsigned MinElts = SrcTy->getMinNumElements();
9606   Value *Call = llvm::PoisonValue::get(Ty);
9607   for (unsigned I = 0; I < Ops.size(); I++) {
9608     Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
9609     Call = Builder.CreateInsertVector(Ty, Call, Ops[I], Idx);
9610   }
9611 
9612   return Call;
9613 }
9614 
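// Emit an AArch64 SVE ACLE builtin. Most builtins are table-driven: the
// operands are adjusted according to the type flags (merge semantics, splat
// operands, predicate casts, etc.) and lowered to a single overloaded LLVM
// intrinsic; the remaining builtins are handled individually in the switch
// below.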
9615 Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
9616                                                   const CallExpr *E) {
9617   // Find out if any arguments are required to be integer constant expressions.
9618   unsigned ICEArguments = 0;
9619   ASTContext::GetBuiltinTypeError Error;
9620   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
9621   assert(Error == ASTContext::GE_None && "Should not codegen an error");
9622 
9623   llvm::Type *Ty = ConvertType(E->getType());
9624   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
9625       BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
9626     Value *Val = EmitScalarExpr(E->getArg(0));
9627     return EmitSVEReinterpret(Val, Ty);
9628   }
9629 
9630   llvm::SmallVector<Value *, 4> Ops;
9631   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
9632     if ((ICEArguments & (1 << i)) == 0)
9633       Ops.push_back(EmitScalarExpr(E->getArg(i)));
9634     else {
9635       // If this is required to be a constant, constant fold it so that we know
9636       // that the generated intrinsic gets a ConstantInt.
9637       std::optional<llvm::APSInt> Result =
9638           E->getArg(i)->getIntegerConstantExpr(getContext());
9639       assert(Result && "Expected argument to be a constant");
9640 
      // Immediates for SVE llvm intrinsics are always 32 bits. We can safely
      // truncate because the immediate has been range-checked and no valid
      // immediate requires more than a handful of bits.
9644       *Result = Result->extOrTrunc(32);
9645       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
9646     }
9647   }
9648 
9649   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
9650                                               AArch64SVEIntrinsicsProvenSorted);
9651   SVETypeFlags TypeFlags(Builtin->TypeModifier);
9652   if (TypeFlags.isLoad())
9653     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
9654                              TypeFlags.isZExtReturn());
9655   else if (TypeFlags.isStore())
9656     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
9657   else if (TypeFlags.isGatherLoad())
9658     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9659   else if (TypeFlags.isScatterStore())
9660     return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9661   else if (TypeFlags.isPrefetch())
9662     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9663   else if (TypeFlags.isGatherPrefetch())
9664     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
  else if (TypeFlags.isStructLoad())
    return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
  else if (TypeFlags.isStructStore())
    return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
  else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
    return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
  else if (TypeFlags.isTupleCreate())
    return EmitSVETupleCreate(TypeFlags, Ty, Ops);
9673   else if (TypeFlags.isUndef())
9674     return UndefValue::get(Ty);
9675   else if (Builtin->LLVMIntrinsic != 0) {
9676     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
9677       InsertExplicitZeroOperand(Builder, Ty, Ops);
9678 
9679     if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
9680       InsertExplicitUndefOperand(Builder, Ty, Ops);
9681 
9682     // Some ACLE builtins leave out the argument to specify the predicate
9683     // pattern, which is expected to be expanded to an SV_ALL pattern.
9684     if (TypeFlags.isAppendSVALL())
9685       Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31));
9686     if (TypeFlags.isInsertOp1SVALL())
9687       Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
9688 
9689     // Predicates must match the main datatype.
9690     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
9691       if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
9692         if (PredTy->getElementType()->isIntegerTy(1))
9693           Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
9694 
9695     // Splat scalar operand to vector (intrinsics with _n infix)
9696     if (TypeFlags.hasSplatOperand()) {
9697       unsigned OpNo = TypeFlags.getSplatOperand();
9698       Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
9699     }
9700 
9701     if (TypeFlags.isReverseCompare())
9702       std::swap(Ops[1], Ops[2]);
9703     else if (TypeFlags.isReverseUSDOT())
9704       std::swap(Ops[1], Ops[2]);
9705     else if (TypeFlags.isReverseMergeAnyBinOp() &&
9706              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
9707       std::swap(Ops[1], Ops[2]);
9708     else if (TypeFlags.isReverseMergeAnyAccOp() &&
9709              TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
9710       std::swap(Ops[1], Ops[3]);
9711 
9712     // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
9713     if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
9714       llvm::Type *OpndTy = Ops[1]->getType();
9715       auto *SplatZero = Constant::getNullValue(OpndTy);
9716       Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero);
9717     }
9718 
9719     Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
9720                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
9721     Value *Call = Builder.CreateCall(F, Ops);
9722 
9723     // Predicate results must be converted to svbool_t.
9724     if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
9725       if (PredTy->getScalarType()->isIntegerTy(1))
9726         Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
9727 
9728     return Call;
9729   }
9730 
9731   switch (BuiltinID) {
9732   default:
9733     return nullptr;
9734 
9735   case SVE::BI__builtin_sve_svmov_b_z: {
9736     // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
9737     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9738     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
9739     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
9740     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
9741   }
9742 
9743   case SVE::BI__builtin_sve_svnot_b_z: {
9744     // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
9745     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9746     llvm::Type* OverloadedTy = getSVEType(TypeFlags);
9747     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
9748     return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
9749   }
9750 
9751   case SVE::BI__builtin_sve_svmovlb_u16:
9752   case SVE::BI__builtin_sve_svmovlb_u32:
9753   case SVE::BI__builtin_sve_svmovlb_u64:
9754     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
9755 
9756   case SVE::BI__builtin_sve_svmovlb_s16:
9757   case SVE::BI__builtin_sve_svmovlb_s32:
9758   case SVE::BI__builtin_sve_svmovlb_s64:
9759     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
9760 
9761   case SVE::BI__builtin_sve_svmovlt_u16:
9762   case SVE::BI__builtin_sve_svmovlt_u32:
9763   case SVE::BI__builtin_sve_svmovlt_u64:
9764     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
9765 
9766   case SVE::BI__builtin_sve_svmovlt_s16:
9767   case SVE::BI__builtin_sve_svmovlt_s32:
9768   case SVE::BI__builtin_sve_svmovlt_s64:
9769     return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
9770 
9771   case SVE::BI__builtin_sve_svpmullt_u16:
9772   case SVE::BI__builtin_sve_svpmullt_u64:
9773   case SVE::BI__builtin_sve_svpmullt_n_u16:
9774   case SVE::BI__builtin_sve_svpmullt_n_u64:
9775     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
9776 
9777   case SVE::BI__builtin_sve_svpmullb_u16:
9778   case SVE::BI__builtin_sve_svpmullb_u64:
9779   case SVE::BI__builtin_sve_svpmullb_n_u16:
9780   case SVE::BI__builtin_sve_svpmullb_n_u64:
9781     return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
9782 
9783   case SVE::BI__builtin_sve_svdup_n_b8:
9784   case SVE::BI__builtin_sve_svdup_n_b16:
9785   case SVE::BI__builtin_sve_svdup_n_b32:
9786   case SVE::BI__builtin_sve_svdup_n_b64: {
9787     Value *CmpNE =
9788         Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
9789     llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
9790     Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
9791     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
9792   }
9793 
9794   case SVE::BI__builtin_sve_svdupq_n_b8:
9795   case SVE::BI__builtin_sve_svdupq_n_b16:
9796   case SVE::BI__builtin_sve_svdupq_n_b32:
9797   case SVE::BI__builtin_sve_svdupq_n_b64:
9798   case SVE::BI__builtin_sve_svdupq_n_u8:
9799   case SVE::BI__builtin_sve_svdupq_n_s8:
9800   case SVE::BI__builtin_sve_svdupq_n_u64:
9801   case SVE::BI__builtin_sve_svdupq_n_f64:
9802   case SVE::BI__builtin_sve_svdupq_n_s64:
9803   case SVE::BI__builtin_sve_svdupq_n_u16:
9804   case SVE::BI__builtin_sve_svdupq_n_f16:
9805   case SVE::BI__builtin_sve_svdupq_n_bf16:
9806   case SVE::BI__builtin_sve_svdupq_n_s16:
9807   case SVE::BI__builtin_sve_svdupq_n_u32:
9808   case SVE::BI__builtin_sve_svdupq_n_f32:
9809   case SVE::BI__builtin_sve_svdupq_n_s32: {
    // These builtins are implemented by building a fixed-length vector from
    // the scalar operands, inserting it into a scalable vector, and then
    // splatting the first 128-bit segment with the dupq_lane intrinsic.
9812     unsigned NumOpnds = Ops.size();
9813 
9814     bool IsBoolTy =
9815         cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
9816 
    // For svdupq_n_b* the element type is an integer of width 128/numelts,
    // so that the compare can use the width that is natural for the expected
    // number of predicate lanes.
9820     llvm::Type *EltTy = Ops[0]->getType();
9821     if (IsBoolTy)
9822       EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
9823 
9824     SmallVector<llvm::Value *, 16> VecOps;
    for (unsigned I = 0; I < NumOpnds; ++I)
      VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
9827     Value *Vec = BuildVector(VecOps);
9828 
9829     llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
9830     Value *InsertSubVec = Builder.CreateInsertVector(
9831         OverloadedTy, PoisonValue::get(OverloadedTy), Vec, Builder.getInt64(0));
9832 
9833     Function *F =
9834         CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
9835     Value *DupQLane =
9836         Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
9837 
9838     if (!IsBoolTy)
9839       return DupQLane;
9840 
9841     SVETypeFlags TypeFlags(Builtin->TypeModifier);
9842     Value *Pred = EmitSVEAllTruePred(TypeFlags);
9843 
9844     // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
9845     F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
9846                                        : Intrinsic::aarch64_sve_cmpne_wide,
9847                          OverloadedTy);
9848     Value *Call = Builder.CreateCall(
9849         F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
9850     return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
9851   }
9852 
9853   case SVE::BI__builtin_sve_svpfalse_b:
9854     return ConstantInt::getFalse(Ty);
9855 
9856   case SVE::BI__builtin_sve_svlen_bf16:
9857   case SVE::BI__builtin_sve_svlen_f16:
9858   case SVE::BI__builtin_sve_svlen_f32:
9859   case SVE::BI__builtin_sve_svlen_f64:
9860   case SVE::BI__builtin_sve_svlen_s8:
9861   case SVE::BI__builtin_sve_svlen_s16:
9862   case SVE::BI__builtin_sve_svlen_s32:
9863   case SVE::BI__builtin_sve_svlen_s64:
9864   case SVE::BI__builtin_sve_svlen_u8:
9865   case SVE::BI__builtin_sve_svlen_u16:
9866   case SVE::BI__builtin_sve_svlen_u32:
9867   case SVE::BI__builtin_sve_svlen_u64: {
9868     SVETypeFlags TF(Builtin->TypeModifier);
9869     auto VTy = cast<llvm::VectorType>(getSVEType(TF));
9870     auto *NumEls =
9871         llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue());
9872 
9873     Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty);
9874     return Builder.CreateMul(NumEls, Builder.CreateCall(F));
9875   }
9876 
9877   case SVE::BI__builtin_sve_svtbl2_u8:
9878   case SVE::BI__builtin_sve_svtbl2_s8:
9879   case SVE::BI__builtin_sve_svtbl2_u16:
9880   case SVE::BI__builtin_sve_svtbl2_s16:
9881   case SVE::BI__builtin_sve_svtbl2_u32:
9882   case SVE::BI__builtin_sve_svtbl2_s32:
9883   case SVE::BI__builtin_sve_svtbl2_u64:
9884   case SVE::BI__builtin_sve_svtbl2_s64:
9885   case SVE::BI__builtin_sve_svtbl2_f16:
9886   case SVE::BI__builtin_sve_svtbl2_bf16:
9887   case SVE::BI__builtin_sve_svtbl2_f32:
9888   case SVE::BI__builtin_sve_svtbl2_f64: {
9889     SVETypeFlags TF(Builtin->TypeModifier);
9890     auto VTy = cast<llvm::ScalableVectorType>(getSVEType(TF));
9891     Value *V0 = Builder.CreateExtractVector(VTy, Ops[0],
9892                                             ConstantInt::get(CGM.Int64Ty, 0));
9893     unsigned MinElts = VTy->getMinNumElements();
9894     Value *V1 = Builder.CreateExtractVector(
9895         VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts));
9896     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
9897     return Builder.CreateCall(F, {V0, V1, Ops[1]});
9898   }
9899 
9900   case SVE::BI__builtin_sve_svset_neonq_s8:
9901   case SVE::BI__builtin_sve_svset_neonq_s16:
9902   case SVE::BI__builtin_sve_svset_neonq_s32:
9903   case SVE::BI__builtin_sve_svset_neonq_s64:
9904   case SVE::BI__builtin_sve_svset_neonq_u8:
9905   case SVE::BI__builtin_sve_svset_neonq_u16:
9906   case SVE::BI__builtin_sve_svset_neonq_u32:
9907   case SVE::BI__builtin_sve_svset_neonq_u64:
9908   case SVE::BI__builtin_sve_svset_neonq_f16:
9909   case SVE::BI__builtin_sve_svset_neonq_f32:
9910   case SVE::BI__builtin_sve_svset_neonq_f64:
9911   case SVE::BI__builtin_sve_svset_neonq_bf16: {
9912     return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], Builder.getInt64(0));
9913   }
9914 
9915   case SVE::BI__builtin_sve_svget_neonq_s8:
9916   case SVE::BI__builtin_sve_svget_neonq_s16:
9917   case SVE::BI__builtin_sve_svget_neonq_s32:
9918   case SVE::BI__builtin_sve_svget_neonq_s64:
9919   case SVE::BI__builtin_sve_svget_neonq_u8:
9920   case SVE::BI__builtin_sve_svget_neonq_u16:
9921   case SVE::BI__builtin_sve_svget_neonq_u32:
9922   case SVE::BI__builtin_sve_svget_neonq_u64:
9923   case SVE::BI__builtin_sve_svget_neonq_f16:
9924   case SVE::BI__builtin_sve_svget_neonq_f32:
9925   case SVE::BI__builtin_sve_svget_neonq_f64:
9926   case SVE::BI__builtin_sve_svget_neonq_bf16: {
9927     return Builder.CreateExtractVector(Ty, Ops[0], Builder.getInt64(0));
9928   }
9929 
9930   case SVE::BI__builtin_sve_svdup_neonq_s8:
9931   case SVE::BI__builtin_sve_svdup_neonq_s16:
9932   case SVE::BI__builtin_sve_svdup_neonq_s32:
9933   case SVE::BI__builtin_sve_svdup_neonq_s64:
9934   case SVE::BI__builtin_sve_svdup_neonq_u8:
9935   case SVE::BI__builtin_sve_svdup_neonq_u16:
9936   case SVE::BI__builtin_sve_svdup_neonq_u32:
9937   case SVE::BI__builtin_sve_svdup_neonq_u64:
9938   case SVE::BI__builtin_sve_svdup_neonq_f16:
9939   case SVE::BI__builtin_sve_svdup_neonq_f32:
9940   case SVE::BI__builtin_sve_svdup_neonq_f64:
9941   case SVE::BI__builtin_sve_svdup_neonq_bf16: {
9942     Value *Insert = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
9943                                                Builder.getInt64(0));
9944     return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty},
9945                                    {Insert, Builder.getInt64(0)});
9946   }
9947   }
9948 
  // Should not happen.
9950   return nullptr;
9951 }
9952 
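// Emit an AArch64 SME ACLE builtin, following the same table-driven scheme as
// the SVE builtins above.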
9953 Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
9954                                                   const CallExpr *E) {
9955   // Find out if any arguments are required to be integer constant expressions.
9956   unsigned ICEArguments = 0;
9957   ASTContext::GetBuiltinTypeError Error;
9958   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
9959   assert(Error == ASTContext::GE_None && "Should not codegen an error");
9960 
9961   llvm::Type *Ty = ConvertType(E->getType());
9962   llvm::SmallVector<Value *, 4> Ops;
9963   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
9964     if ((ICEArguments & (1 << i)) == 0)
9965       Ops.push_back(EmitScalarExpr(E->getArg(i)));
9966     else {
9967       // If this is required to be a constant, constant fold it so that we know
9968       // that the generated intrinsic gets a ConstantInt.
9969       std::optional<llvm::APSInt> Result =
9970           E->getArg(i)->getIntegerConstantExpr(getContext());
9971       assert(Result && "Expected argument to be a constant");
9972 
      // Immediates for SVE llvm intrinsics are always 32 bits. We can safely
      // truncate because the immediate has been range-checked and no valid
      // immediate requires more than a handful of bits.
9976       *Result = Result->extOrTrunc(32);
9977       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
9978     }
9979   }
9980 
9981   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
9982                                               AArch64SMEIntrinsicsProvenSorted);
9983   SVETypeFlags TypeFlags(Builtin->TypeModifier);
9984   if (TypeFlags.isLoad() || TypeFlags.isStore())
9985     return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9986   else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
9987     return EmitSMEReadWrite(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9988   else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
9989            BuiltinID == SME::BI__builtin_sme_svzero_za)
9990     return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9991   else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
9992            BuiltinID == SME::BI__builtin_sme_svstr_vnum_za)
9993     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
9994   else if (Builtin->LLVMIntrinsic != 0) {
9995     // Predicates must match the main datatype.
9996     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
9997       if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
9998         if (PredTy->getElementType()->isIntegerTy(1))
9999           Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
10000 
10001     Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
10002                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
10003     Value *Call = Builder.CreateCall(F, Ops);
10004     return Call;
10005   }
10006 
  // Should not happen.
10008   return nullptr;
10009 }
10010 
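// Emit an AArch64 builtin. SVE and SME builtins are dispatched to the handlers
// above; the remaining ones (hints, bit manipulation, exclusive load/store,
// CRC32, MTE, system registers, MSVC intrinsics, NEON, ...) are handled here.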
10011 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
10012                                                const CallExpr *E,
10013                                                llvm::Triple::ArchType Arch) {
10014   if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
10015       BuiltinID <= clang::AArch64::LastSVEBuiltin)
10016     return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
10017 
10018   if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
10019       BuiltinID <= clang::AArch64::LastSMEBuiltin)
10020     return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
10021 
10022   unsigned HintID = static_cast<unsigned>(-1);
10023   switch (BuiltinID) {
10024   default: break;
10025   case clang::AArch64::BI__builtin_arm_nop:
10026     HintID = 0;
10027     break;
10028   case clang::AArch64::BI__builtin_arm_yield:
10029   case clang::AArch64::BI__yield:
10030     HintID = 1;
10031     break;
10032   case clang::AArch64::BI__builtin_arm_wfe:
10033   case clang::AArch64::BI__wfe:
10034     HintID = 2;
10035     break;
10036   case clang::AArch64::BI__builtin_arm_wfi:
10037   case clang::AArch64::BI__wfi:
10038     HintID = 3;
10039     break;
10040   case clang::AArch64::BI__builtin_arm_sev:
10041   case clang::AArch64::BI__sev:
10042     HintID = 4;
10043     break;
10044   case clang::AArch64::BI__builtin_arm_sevl:
10045   case clang::AArch64::BI__sevl:
10046     HintID = 5;
10047     break;
10048   }
10049 
10050   if (HintID != static_cast<unsigned>(-1)) {
10051     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
10052     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
10053   }
10054 
10055   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
10056     assert((getContext().getTypeSize(E->getType()) == 32) &&
10057            "rbit of unusual size!");
10058     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10059     return Builder.CreateCall(
10060         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10061   }
10062   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
10063     assert((getContext().getTypeSize(E->getType()) == 64) &&
10064            "rbit of unusual size!");
10065     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10066     return Builder.CreateCall(
10067         CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10068   }
10069 
10070   if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
10071       BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
10072     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10073     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
10074     Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
10075     if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
10076       Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
10077     return Res;
10078   }
10079 
10080   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
10081     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10082     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
10083                               "cls");
10084   }
10085   if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
10086     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10087     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
10088                               "cls");
10089   }
10090 
10091   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
10092       BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
10093     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10094     llvm::Type *Ty = Arg->getType();
10095     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty),
10096                               Arg, "frint32z");
10097   }
10098 
10099   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
10100       BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
10101     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10102     llvm::Type *Ty = Arg->getType();
10103     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty),
10104                               Arg, "frint64z");
10105   }
10106 
10107   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
10108       BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
10109     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10110     llvm::Type *Ty = Arg->getType();
10111     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty),
10112                               Arg, "frint32x");
10113   }
10114 
10115   if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
10116       BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
10117     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10118     llvm::Type *Ty = Arg->getType();
10119     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty),
10120                               Arg, "frint64x");
10121   }
10122 
10123   if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
10124     assert((getContext().getTypeSize(E->getType()) == 32) &&
10125            "__jcvt of unusual size!");
10126     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10127     return Builder.CreateCall(
10128         CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
10129   }
10130 
10131   if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
10132       BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
10133       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
10134       BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
10135     llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
10136     llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
10137 
10138     if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
      // Load from the address via an LLVM intrinsic, receiving a
      // tuple of 8 i64 words; then store each word to ValPtr.
10141       Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
10142       llvm::Value *Val = Builder.CreateCall(F, MemAddr);
10143       llvm::Value *ToRet;
10144       for (size_t i = 0; i < 8; i++) {
10145         llvm::Value *ValOffsetPtr =
10146             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10147         Address Addr =
10148             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10149         ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
10150       }
10151       return ToRet;
10152     } else {
10153       // Load 8 i64 words from ValPtr, and store them to the address
10154       // via an LLVM intrinsic.
10155       SmallVector<llvm::Value *, 9> Args;
10156       Args.push_back(MemAddr);
10157       for (size_t i = 0; i < 8; i++) {
10158         llvm::Value *ValOffsetPtr =
10159             Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10160         Address Addr =
10161             Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10162         Args.push_back(Builder.CreateLoad(Addr));
10163       }
10164 
10165       auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
10166                        ? Intrinsic::aarch64_st64b
10167                    : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
10168                        ? Intrinsic::aarch64_st64bv
10169                        : Intrinsic::aarch64_st64bv0);
10170       Function *F = CGM.getIntrinsic(Intr);
10171       return Builder.CreateCall(F, Args);
10172     }
10173   }
10174 
10175   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
10176       BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
10177 
10178     auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
10179                      ? Intrinsic::aarch64_rndr
10180                      : Intrinsic::aarch64_rndrrs);
10181     Function *F = CGM.getIntrinsic(Intr);
10182     llvm::Value *Val = Builder.CreateCall(F);
10183     Value *RandomValue = Builder.CreateExtractValue(Val, 0);
10184     Value *Status = Builder.CreateExtractValue(Val, 1);
10185 
10186     Address MemAddress = EmitPointerWithAlignment(E->getArg(0));
10187     Builder.CreateStore(RandomValue, MemAddress);
10188     Status = Builder.CreateZExt(Status, Int32Ty);
10189     return Status;
10190   }
10191 
10192   if (BuiltinID == clang::AArch64::BI__clear_cache) {
10193     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
10194     const FunctionDecl *FD = E->getDirectCallee();
10195     Value *Ops[2];
10196     for (unsigned i = 0; i < 2; i++)
10197       Ops[i] = EmitScalarExpr(E->getArg(i));
10198     llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
10199     llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
10200     StringRef Name = FD->getName();
10201     return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
10202   }
10203 
10204   if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10205        BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
10206       getContext().getTypeSize(E->getType()) == 128) {
10207     Function *F =
10208         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10209                              ? Intrinsic::aarch64_ldaxp
10210                              : Intrinsic::aarch64_ldxp);
10211 
10212     Value *LdPtr = EmitScalarExpr(E->getArg(0));
10213     Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
10214                                     "ldxp");
10215 
10216     Value *Val0 = Builder.CreateExtractValue(Val, 1);
10217     Value *Val1 = Builder.CreateExtractValue(Val, 0);
10218     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
10219     Val0 = Builder.CreateZExt(Val0, Int128Ty);
10220     Val1 = Builder.CreateZExt(Val1, Int128Ty);
10221 
10222     Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
10223     Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
10224     Val = Builder.CreateOr(Val, Val1);
10225     return Builder.CreateBitCast(Val, ConvertType(E->getType()));
10226   } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10227              BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
10228     Value *LoadAddr = EmitScalarExpr(E->getArg(0));
10229 
10230     QualType Ty = E->getType();
10231     llvm::Type *RealResTy = ConvertType(Ty);
10232     llvm::Type *IntTy =
10233         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10234     llvm::Type *PtrTy = llvm::PointerType::getUnqual(getLLVMContext());
10235 
10236     Function *F =
10237         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10238                              ? Intrinsic::aarch64_ldaxr
10239                              : Intrinsic::aarch64_ldxr,
10240                          PtrTy);
10241     CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
10242     Val->addParamAttr(
10243         0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
10244 
10245     if (RealResTy->isPointerTy())
10246       return Builder.CreateIntToPtr(Val, RealResTy);
10247 
10248     llvm::Type *IntResTy = llvm::IntegerType::get(
10249         getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
10250     return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
10251                                  RealResTy);
10252   }
10253 
10254   if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10255        BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
10256       getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
10257     Function *F =
10258         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10259                              ? Intrinsic::aarch64_stlxp
10260                              : Intrinsic::aarch64_stxp);
10261     llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
10262 
10263     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
10264     EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
10265 
10266     Tmp = Tmp.withElementType(STy);
10267     llvm::Value *Val = Builder.CreateLoad(Tmp);
10268 
10269     Value *Arg0 = Builder.CreateExtractValue(Val, 0);
10270     Value *Arg1 = Builder.CreateExtractValue(Val, 1);
10271     Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
10272                                          Int8PtrTy);
10273     return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
10274   }
10275 
10276   if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10277       BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
10278     Value *StoreVal = EmitScalarExpr(E->getArg(0));
10279     Value *StoreAddr = EmitScalarExpr(E->getArg(1));
10280 
10281     QualType Ty = E->getArg(0)->getType();
10282     llvm::Type *StoreTy =
10283         llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10284 
10285     if (StoreVal->getType()->isPointerTy())
10286       StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
10287     else {
10288       llvm::Type *IntTy = llvm::IntegerType::get(
10289           getLLVMContext(),
10290           CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
10291       StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
10292       StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
10293     }
10294 
10295     Function *F =
10296         CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10297                              ? Intrinsic::aarch64_stlxr
10298                              : Intrinsic::aarch64_stxr,
10299                          StoreAddr->getType());
10300     CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
10301     CI->addParamAttr(
10302         1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
10303     return CI;
10304   }
10305 
10306   if (BuiltinID == clang::AArch64::BI__getReg) {
10307     Expr::EvalResult Result;
10308     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10309       llvm_unreachable("Sema will ensure that the parameter is constant");
10310 
10311     llvm::APSInt Value = Result.Val.getInt();
10312     LLVMContext &Context = CGM.getLLVMContext();
10313     std::string Reg = Value == 31 ? "sp" : "x" + toString(Value, 10);
10314 
10315     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
10316     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10317     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10318 
10319     llvm::Function *F =
10320         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
10321     return Builder.CreateCall(F, Metadata);
10322   }
10323 
10324   if (BuiltinID == clang::AArch64::BI__break) {
10325     Expr::EvalResult Result;
10326     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10327       llvm_unreachable("Sema will ensure that the parameter is constant");
10328 
10329     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::aarch64_break);
10330     return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
10331   }
10332 
10333   if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
10334     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
10335     return Builder.CreateCall(F);
10336   }
10337 
10338   if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
10339     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
10340                                llvm::SyncScope::SingleThread);
10341 
10342   // CRC32
10343   Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
10344   switch (BuiltinID) {
10345   case clang::AArch64::BI__builtin_arm_crc32b:
10346     CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
10347   case clang::AArch64::BI__builtin_arm_crc32cb:
10348     CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
10349   case clang::AArch64::BI__builtin_arm_crc32h:
10350     CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
10351   case clang::AArch64::BI__builtin_arm_crc32ch:
10352     CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
10353   case clang::AArch64::BI__builtin_arm_crc32w:
10354     CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
10355   case clang::AArch64::BI__builtin_arm_crc32cw:
10356     CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
10357   case clang::AArch64::BI__builtin_arm_crc32d:
10358     CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
10359   case clang::AArch64::BI__builtin_arm_crc32cd:
10360     CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
10361   }
10362 
10363   if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
10364     Value *Arg0 = EmitScalarExpr(E->getArg(0));
10365     Value *Arg1 = EmitScalarExpr(E->getArg(1));
10366     Function *F = CGM.getIntrinsic(CRCIntrinsicID);
10367 
10368     llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
10369     Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
10370 
10371     return Builder.CreateCall(F, {Arg0, Arg1});
10372   }
10373 
10374   // Memory Operations (MOPS)
10375   if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
10376     Value *Dst = EmitScalarExpr(E->getArg(0));
10377     Value *Val = EmitScalarExpr(E->getArg(1));
10378     Value *Size = EmitScalarExpr(E->getArg(2));
10379     Dst = Builder.CreatePointerCast(Dst, Int8PtrTy);
10380     Val = Builder.CreateTrunc(Val, Int8Ty);
10381     Size = Builder.CreateIntCast(Size, Int64Ty, false);
10382     return Builder.CreateCall(
10383         CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
10384   }
10385 
10386   // Memory Tagging Extensions (MTE) Intrinsics
10387   Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
10388   switch (BuiltinID) {
10389   case clang::AArch64::BI__builtin_arm_irg:
10390     MTEIntrinsicID = Intrinsic::aarch64_irg; break;
10391   case clang::AArch64::BI__builtin_arm_addg:
10392     MTEIntrinsicID = Intrinsic::aarch64_addg; break;
10393   case clang::AArch64::BI__builtin_arm_gmi:
10394     MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
10395   case clang::AArch64::BI__builtin_arm_ldg:
10396     MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
10397   case clang::AArch64::BI__builtin_arm_stg:
10398     MTEIntrinsicID = Intrinsic::aarch64_stg; break;
10399   case clang::AArch64::BI__builtin_arm_subp:
10400     MTEIntrinsicID = Intrinsic::aarch64_subp; break;
10401   }
10402 
10403   if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
10404     llvm::Type *T = ConvertType(E->getType());
10405 
10406     if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
10407       Value *Pointer = EmitScalarExpr(E->getArg(0));
10408       Value *Mask = EmitScalarExpr(E->getArg(1));
10409 
10410       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
10411       Mask = Builder.CreateZExt(Mask, Int64Ty);
10412       Value *RV = Builder.CreateCall(
10413                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask});
      return Builder.CreatePointerCast(RV, T);
10415     }
10416     if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
10417       Value *Pointer = EmitScalarExpr(E->getArg(0));
10418       Value *TagOffset = EmitScalarExpr(E->getArg(1));
10419 
10420       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
10421       TagOffset = Builder.CreateZExt(TagOffset, Int64Ty);
10422       Value *RV = Builder.CreateCall(
10423                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset});
10424       return Builder.CreatePointerCast(RV, T);
10425     }
10426     if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
10427       Value *Pointer = EmitScalarExpr(E->getArg(0));
10428       Value *ExcludedMask = EmitScalarExpr(E->getArg(1));
10429 
10430       ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty);
10431       Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
10432       return Builder.CreateCall(
10433                        CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask});
10434     }
    // Although it is possible to supply a different return
    // address (first arg) to this intrinsic, for now we set
    // the return address to be the same as the input address.
10438     if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
10439       Value *TagAddress = EmitScalarExpr(E->getArg(0));
10440       TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
10441       Value *RV = Builder.CreateCall(
10442                     CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
10443       return Builder.CreatePointerCast(RV, T);
10444     }
    // Although it is possible to supply a different tag (to set)
    // to this intrinsic (as first arg), for now we supply
    // the tag that is in the input address arg (the common use case).
    if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
      Value *TagAddress = EmitScalarExpr(E->getArg(0));
      TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
      return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
                                {TagAddress, TagAddress});
    }
10454     if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
10455       Value *PointerA = EmitScalarExpr(E->getArg(0));
10456       Value *PointerB = EmitScalarExpr(E->getArg(1));
10457       PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy);
10458       PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy);
10459       return Builder.CreateCall(
10460                        CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB});
10461     }
10462   }
10463 
10464   if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
10465       BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
10466       BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
10467       BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
10468       BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
10469       BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
10470       BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
10471       BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
10472 
10473     SpecialRegisterAccessKind AccessKind = Write;
10474     if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
10475         BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
10476         BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
10477         BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
10478       AccessKind = VolatileRead;
10479 
10480     bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
10481                             BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
10482 
10483     bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
10484                    BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
10485 
10486     bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
10487                     BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
10488 
10489     llvm::Type *ValueType;
10490     llvm::Type *RegisterType = Int64Ty;
10491     if (Is32Bit) {
10492       ValueType = Int32Ty;
10493     } else if (Is128Bit) {
10494       llvm::Type *Int128Ty =
10495           llvm::IntegerType::getInt128Ty(CGM.getLLVMContext());
10496       ValueType = Int128Ty;
10497       RegisterType = Int128Ty;
10498     } else if (IsPointerBuiltin) {
10499       ValueType = VoidPtrTy;
10500     } else {
10501       ValueType = Int64Ty;
    }
10503 
10504     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
10505                                       AccessKind);
10506   }
10507 
10508   if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
10509       BuiltinID == clang::AArch64::BI_WriteStatusReg) {
10510     LLVMContext &Context = CGM.getLLVMContext();
10511 
10512     unsigned SysReg =
10513       E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
10514 
10515     std::string SysRegStr;
10516     llvm::raw_string_ostream(SysRegStr) <<
10517                        ((1 << 1) | ((SysReg >> 14) & 1))  << ":" <<
10518                        ((SysReg >> 11) & 7)               << ":" <<
10519                        ((SysReg >> 7)  & 15)              << ":" <<
10520                        ((SysReg >> 3)  & 15)              << ":" <<
10521                        ( SysReg        & 7);
10522 
10523     llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
10524     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10525     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10526 
10527     llvm::Type *RegisterType = Int64Ty;
10528     llvm::Type *Types[] = { RegisterType };
10529 
10530     if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
10531       llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
10532 
10533       return Builder.CreateCall(F, Metadata);
10534     }
10535 
10536     llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
10537     llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
10538 
10539     return Builder.CreateCall(F, { Metadata, ArgValue });
10540   }
10541 
10542   if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
10543     llvm::Function *F =
10544         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
10545     return Builder.CreateCall(F);
10546   }
10547 
10548   if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
10549     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
10550     return Builder.CreateCall(F);
10551   }
10552 
10553   if (BuiltinID == clang::AArch64::BI__mulh ||
10554       BuiltinID == clang::AArch64::BI__umulh) {
10555     llvm::Type *ResType = ConvertType(E->getType());
10556     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
10557 
10558     bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
10559     Value *LHS =
10560         Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned);
10561     Value *RHS =
10562         Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned);
10563 
10564     Value *MulResult, *HigherBits;
10565     if (IsSigned) {
10566       MulResult = Builder.CreateNSWMul(LHS, RHS);
10567       HigherBits = Builder.CreateAShr(MulResult, 64);
10568     } else {
10569       MulResult = Builder.CreateNUWMul(LHS, RHS);
10570       HigherBits = Builder.CreateLShr(MulResult, 64);
10571     }
10572     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
10573 
10574     return HigherBits;
10575   }
10576 
10577   if (BuiltinID == AArch64::BI__writex18byte ||
10578       BuiltinID == AArch64::BI__writex18word ||
10579       BuiltinID == AArch64::BI__writex18dword ||
10580       BuiltinID == AArch64::BI__writex18qword) {
10581     llvm::Type *IntTy = ConvertType(E->getArg(1)->getType());
10582 
10583     // Read x18 as i8*
10584     LLVMContext &Context = CGM.getLLVMContext();
10585     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
10586     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10587     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10588     llvm::Function *F =
10589         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
10590     llvm::Value *X18 = Builder.CreateCall(F, Metadata);
10591     X18 = Builder.CreateIntToPtr(X18, llvm::PointerType::get(Int8Ty, 0));
10592 
10593     // Store val at x18 + offset
10594     Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
10595     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
10596     Ptr = Builder.CreatePointerCast(Ptr, llvm::PointerType::get(IntTy, 0));
10597     Value *Val = EmitScalarExpr(E->getArg(1));
10598     StoreInst *Store = Builder.CreateAlignedStore(Val, Ptr, CharUnits::One());
10599     return Store;
10600   }
10601 
10602   if (BuiltinID == AArch64::BI__readx18byte ||
10603       BuiltinID == AArch64::BI__readx18word ||
10604       BuiltinID == AArch64::BI__readx18dword ||
10605       BuiltinID == AArch64::BI__readx18qword) {
10606     llvm::Type *IntTy = ConvertType(E->getType());
10607 
10608     // Read x18 as i8*
10609     LLVMContext &Context = CGM.getLLVMContext();
10610     llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
10611     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10612     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10613     llvm::Function *F =
10614         CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
10615     llvm::Value *X18 = Builder.CreateCall(F, Metadata);
10616     X18 = Builder.CreateIntToPtr(X18, llvm::PointerType::get(Int8Ty, 0));
10617 
10618     // Load x18 + offset
10619     Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
10620     Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
10621     Ptr = Builder.CreatePointerCast(Ptr, llvm::PointerType::get(IntTy, 0));
10622     LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One());
10623     return Load;
10624   }
10625 
10626   // Handle MSVC intrinsics before argument evaluation to prevent double
10627   // evaluation.
10628   if (std::optional<MSVCIntrin> MsvcIntId =
10629           translateAarch64ToMsvcIntrin(BuiltinID))
10630     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
10631 
  // Some intrinsics are equivalent - if they are, use the base intrinsic ID.
10633   auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
10634     return P.first == BuiltinID;
10635   });
10636   if (It != end(NEONEquivalentIntrinsicMap))
10637     BuiltinID = It->second;
10638 
10639   // Find out if any arguments are required to be integer constant
10640   // expressions.
10641   unsigned ICEArguments = 0;
10642   ASTContext::GetBuiltinTypeError Error;
10643   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
10644   assert(Error == ASTContext::GE_None && "Should not codegen an error");
10645 
10646   llvm::SmallVector<Value*, 4> Ops;
10647   Address PtrOp0 = Address::invalid();
10648   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
10649     if (i == 0) {
10650       switch (BuiltinID) {
10651       case NEON::BI__builtin_neon_vld1_v:
10652       case NEON::BI__builtin_neon_vld1q_v:
10653       case NEON::BI__builtin_neon_vld1_dup_v:
10654       case NEON::BI__builtin_neon_vld1q_dup_v:
10655       case NEON::BI__builtin_neon_vld1_lane_v:
10656       case NEON::BI__builtin_neon_vld1q_lane_v:
10657       case NEON::BI__builtin_neon_vst1_v:
10658       case NEON::BI__builtin_neon_vst1q_v:
10659       case NEON::BI__builtin_neon_vst1_lane_v:
10660       case NEON::BI__builtin_neon_vst1q_lane_v:
10661       case NEON::BI__builtin_neon_vldap1_lane_s64:
10662       case NEON::BI__builtin_neon_vldap1q_lane_s64:
10663       case NEON::BI__builtin_neon_vstl1_lane_s64:
10664       case NEON::BI__builtin_neon_vstl1q_lane_s64:
10665         // Get the alignment for the argument in addition to the value;
10666         // we'll use it later.
10667         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
10668         Ops.push_back(PtrOp0.getPointer());
10669         continue;
10670       }
10671     }
10672     if ((ICEArguments & (1 << i)) == 0) {
10673       Ops.push_back(EmitScalarExpr(E->getArg(i)));
10674     } else {
10675       // If this is required to be a constant, constant fold it so that we know
10676       // that the generated intrinsic gets a ConstantInt.
10677       Ops.push_back(llvm::ConstantInt::get(
10678           getLLVMContext(),
10679           *E->getArg(i)->getIntegerConstantExpr(getContext())));
10680     }
10681   }
10682 
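  // Scalar (SISD) variants of the NEON intrinsics are handled through their
  // own table before the generic paths below.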
10683   auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
10684   const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
10685       SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
10686 
10687   if (Builtin) {
10688     Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
10689     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
10690     assert(Result && "SISD intrinsic should have been handled");
10691     return Result;
10692   }
10693 
10694   const Expr *Arg = E->getArg(E->getNumArgs()-1);
10695   NeonTypeFlags Type(0);
10696   if (std::optional<llvm::APSInt> Result =
10697           Arg->getIntegerConstantExpr(getContext()))
10698     // Determine the type of this overloaded NEON intrinsic.
10699     Type = NeonTypeFlags(Result->getZExtValue());
10700 
10701   bool usgn = Type.isUnsigned();
10702   bool quad = Type.isQuad();
10703 
10704   // Handle non-overloaded intrinsics first.
10705   switch (BuiltinID) {
10706   default: break;
10707   case NEON::BI__builtin_neon_vabsh_f16:
10708     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10709     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
10710   case NEON::BI__builtin_neon_vaddq_p128: {
10711     llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
10712     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10713     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
10714     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
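    // Addition of polynomials over GF(2) is just XOR of the 128-bit values.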
10715     Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
10716     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
10717     return Builder.CreateBitCast(Ops[0], Int128Ty);
10718   }
10719   case NEON::BI__builtin_neon_vldrq_p128: {
10720     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
10721     llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
10722     Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
10723     return Builder.CreateAlignedLoad(Int128Ty, Ptr,
10724                                      CharUnits::fromQuantity(16));
10725   }
10726   case NEON::BI__builtin_neon_vstrq_p128: {
10727     llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
10728     Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
10729     return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
10730   }
10731   case NEON::BI__builtin_neon_vcvts_f32_u32:
10732   case NEON::BI__builtin_neon_vcvtd_f64_u64:
10733     usgn = true;
10734     [[fallthrough]];
10735   case NEON::BI__builtin_neon_vcvts_f32_s32:
10736   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
10737     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10738     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
10739     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
10740     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
10741     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
10742     if (usgn)
10743       return Builder.CreateUIToFP(Ops[0], FTy);
10744     return Builder.CreateSIToFP(Ops[0], FTy);
10745   }
10746   case NEON::BI__builtin_neon_vcvth_f16_u16:
10747   case NEON::BI__builtin_neon_vcvth_f16_u32:
10748   case NEON::BI__builtin_neon_vcvth_f16_u64:
10749     usgn = true;
10750     [[fallthrough]];
10751   case NEON::BI__builtin_neon_vcvth_f16_s16:
10752   case NEON::BI__builtin_neon_vcvth_f16_s32:
10753   case NEON::BI__builtin_neon_vcvth_f16_s64: {
10754     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10755     llvm::Type *FTy = HalfTy;
10756     llvm::Type *InTy;
10757     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
10758       InTy = Int64Ty;
10759     else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
10760       InTy = Int32Ty;
10761     else
10762       InTy = Int16Ty;
10763     Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
10764     if (usgn)
10765       return Builder.CreateUIToFP(Ops[0], FTy);
10766     return Builder.CreateSIToFP(Ops[0], FTy);
10767   }
10768   case NEON::BI__builtin_neon_vcvtah_u16_f16:
10769   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
10770   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
10771   case NEON::BI__builtin_neon_vcvtph_u16_f16:
10772   case NEON::BI__builtin_neon_vcvth_u16_f16:
10773   case NEON::BI__builtin_neon_vcvtah_s16_f16:
10774   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
10775   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
10776   case NEON::BI__builtin_neon_vcvtph_s16_f16:
10777   case NEON::BI__builtin_neon_vcvth_s16_f16: {
10778     unsigned Int;
10779     llvm::Type* InTy = Int32Ty;
10780     llvm::Type* FTy  = HalfTy;
10781     llvm::Type *Tys[2] = {InTy, FTy};
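    // The conversion is performed with an i32 result and then truncated back
    // down to the i16 the builtin returns.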
10782     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10783     switch (BuiltinID) {
10784     default: llvm_unreachable("missing builtin ID in switch!");
10785     case NEON::BI__builtin_neon_vcvtah_u16_f16:
10786       Int = Intrinsic::aarch64_neon_fcvtau; break;
10787     case NEON::BI__builtin_neon_vcvtmh_u16_f16:
10788       Int = Intrinsic::aarch64_neon_fcvtmu; break;
10789     case NEON::BI__builtin_neon_vcvtnh_u16_f16:
10790       Int = Intrinsic::aarch64_neon_fcvtnu; break;
10791     case NEON::BI__builtin_neon_vcvtph_u16_f16:
10792       Int = Intrinsic::aarch64_neon_fcvtpu; break;
10793     case NEON::BI__builtin_neon_vcvth_u16_f16:
10794       Int = Intrinsic::aarch64_neon_fcvtzu; break;
10795     case NEON::BI__builtin_neon_vcvtah_s16_f16:
10796       Int = Intrinsic::aarch64_neon_fcvtas; break;
10797     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
10798       Int = Intrinsic::aarch64_neon_fcvtms; break;
10799     case NEON::BI__builtin_neon_vcvtnh_s16_f16:
10800       Int = Intrinsic::aarch64_neon_fcvtns; break;
10801     case NEON::BI__builtin_neon_vcvtph_s16_f16:
10802       Int = Intrinsic::aarch64_neon_fcvtps; break;
10803     case NEON::BI__builtin_neon_vcvth_s16_f16:
10804       Int = Intrinsic::aarch64_neon_fcvtzs; break;
10805     }
10806     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
10807     return Builder.CreateTrunc(Ops[0], Int16Ty);
10808   }
10809   case NEON::BI__builtin_neon_vcaleh_f16:
10810   case NEON::BI__builtin_neon_vcalth_f16:
10811   case NEON::BI__builtin_neon_vcageh_f16:
10812   case NEON::BI__builtin_neon_vcagth_f16: {
10813     unsigned Int;
10814     llvm::Type* InTy = Int32Ty;
10815     llvm::Type* FTy  = HalfTy;
10816     llvm::Type *Tys[2] = {InTy, FTy};
10817     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10818     switch (BuiltinID) {
10819     default: llvm_unreachable("missing builtin ID in switch!");
10820     case NEON::BI__builtin_neon_vcageh_f16:
10821       Int = Intrinsic::aarch64_neon_facge; break;
10822     case NEON::BI__builtin_neon_vcagth_f16:
10823       Int = Intrinsic::aarch64_neon_facgt; break;
10824     case NEON::BI__builtin_neon_vcaleh_f16:
10825       Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
10826     case NEON::BI__builtin_neon_vcalth_f16:
10827       Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
10828     }
10829     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
10830     return Builder.CreateTrunc(Ops[0], Int16Ty);
10831   }
10832   case NEON::BI__builtin_neon_vcvth_n_s16_f16:
10833   case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
10834     unsigned Int;
10835     llvm::Type* InTy = Int32Ty;
10836     llvm::Type* FTy  = HalfTy;
10837     llvm::Type *Tys[2] = {InTy, FTy};
10838     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10839     switch (BuiltinID) {
10840     default: llvm_unreachable("missing builtin ID in switch!");
10841     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
10842       Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
10843     case NEON::BI__builtin_neon_vcvth_n_u16_f16:
10844       Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
10845     }
10846     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
10847     return Builder.CreateTrunc(Ops[0], Int16Ty);
10848   }
10849   case NEON::BI__builtin_neon_vcvth_n_f16_s16:
10850   case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
10851     unsigned Int;
10852     llvm::Type* FTy  = HalfTy;
10853     llvm::Type* InTy = Int32Ty;
10854     llvm::Type *Tys[2] = {FTy, InTy};
10855     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10856     switch (BuiltinID) {
10857     default: llvm_unreachable("missing builtin ID in switch!");
10858     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
10859       Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
10860       Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
10861       break;
10862     case NEON::BI__builtin_neon_vcvth_n_f16_u16:
10863       Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
10864       Ops[0] = Builder.CreateZExt(Ops[0], InTy);
10865       break;
10866     }
10867     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
10868   }
10869   case NEON::BI__builtin_neon_vpaddd_s64: {
10870     auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
10871     Value *Vec = EmitScalarExpr(E->getArg(0));
10872     // The vector is v2i64, so make sure it's bitcast to that.
10873     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
10874     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10875     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10876     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10877     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
10878     // Pairwise addition of a v2i64 into a scalar i64.
10879     return Builder.CreateAdd(Op0, Op1, "vpaddd");
10880   }
10881   case NEON::BI__builtin_neon_vpaddd_f64: {
10882     auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
10883     Value *Vec = EmitScalarExpr(E->getArg(0));
10884     // The vector is v2f64, so make sure it's bitcast to that.
10885     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
10886     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10887     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10888     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10889     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
10890     // Pairwise addition of a v2f64 into a scalar f64.
10891     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
10892   }
10893   case NEON::BI__builtin_neon_vpadds_f32: {
10894     auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
10895     Value *Vec = EmitScalarExpr(E->getArg(0));
10896     // The vector is v2f32, so make sure it's bitcast to that.
10897     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
10898     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
10899     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
10900     Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
10901     Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
10902     // Pairwise addition of a v2f32 into a scalar f32.
10903     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
10904   }
10905   case NEON::BI__builtin_neon_vceqzd_s64:
10906   case NEON::BI__builtin_neon_vceqzd_f64:
10907   case NEON::BI__builtin_neon_vceqzs_f32:
10908   case NEON::BI__builtin_neon_vceqzh_f16:
10909     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10910     return EmitAArch64CompareBuiltinExpr(
10911         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10912         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
10913   case NEON::BI__builtin_neon_vcgezd_s64:
10914   case NEON::BI__builtin_neon_vcgezd_f64:
10915   case NEON::BI__builtin_neon_vcgezs_f32:
10916   case NEON::BI__builtin_neon_vcgezh_f16:
10917     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10918     return EmitAArch64CompareBuiltinExpr(
10919         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10920         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
10921   case NEON::BI__builtin_neon_vclezd_s64:
10922   case NEON::BI__builtin_neon_vclezd_f64:
10923   case NEON::BI__builtin_neon_vclezs_f32:
10924   case NEON::BI__builtin_neon_vclezh_f16:
10925     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10926     return EmitAArch64CompareBuiltinExpr(
10927         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10928         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
10929   case NEON::BI__builtin_neon_vcgtzd_s64:
10930   case NEON::BI__builtin_neon_vcgtzd_f64:
10931   case NEON::BI__builtin_neon_vcgtzs_f32:
10932   case NEON::BI__builtin_neon_vcgtzh_f16:
10933     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10934     return EmitAArch64CompareBuiltinExpr(
10935         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10936         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
10937   case NEON::BI__builtin_neon_vcltzd_s64:
10938   case NEON::BI__builtin_neon_vcltzd_f64:
10939   case NEON::BI__builtin_neon_vcltzs_f32:
10940   case NEON::BI__builtin_neon_vcltzh_f16:
10941     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10942     return EmitAArch64CompareBuiltinExpr(
10943         Ops[0], ConvertType(E->getCallReturnType(getContext())),
10944         ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
10945 
10946   case NEON::BI__builtin_neon_vceqzd_u64: {
10947     Ops.push_back(EmitScalarExpr(E->getArg(0)));
10948     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
10949     Ops[0] =
10950         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
10951     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
10952   }
10953   case NEON::BI__builtin_neon_vceqd_f64:
10954   case NEON::BI__builtin_neon_vcled_f64:
10955   case NEON::BI__builtin_neon_vcltd_f64:
10956   case NEON::BI__builtin_neon_vcged_f64:
10957   case NEON::BI__builtin_neon_vcgtd_f64: {
10958     llvm::CmpInst::Predicate P;
10959     switch (BuiltinID) {
10960     default: llvm_unreachable("missing builtin ID in switch!");
10961     case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
10962     case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
10963     case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
10964     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
10965     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
10966     }
10967     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10968     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
10969     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
10970     if (P == llvm::FCmpInst::FCMP_OEQ)
10971       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
10972     else
10973       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
10974     return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
10975   }
10976   case NEON::BI__builtin_neon_vceqs_f32:
10977   case NEON::BI__builtin_neon_vcles_f32:
10978   case NEON::BI__builtin_neon_vclts_f32:
10979   case NEON::BI__builtin_neon_vcges_f32:
10980   case NEON::BI__builtin_neon_vcgts_f32: {
10981     llvm::CmpInst::Predicate P;
10982     switch (BuiltinID) {
10983     default: llvm_unreachable("missing builtin ID in switch!");
10984     case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
10985     case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
10986     case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
10987     case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
10988     case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
10989     }
10990     Ops.push_back(EmitScalarExpr(E->getArg(1)));
10991     Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
10992     Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
10993     if (P == llvm::FCmpInst::FCMP_OEQ)
10994       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
10995     else
10996       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
10997     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
10998   }
10999   case NEON::BI__builtin_neon_vceqh_f16:
11000   case NEON::BI__builtin_neon_vcleh_f16:
11001   case NEON::BI__builtin_neon_vclth_f16:
11002   case NEON::BI__builtin_neon_vcgeh_f16:
11003   case NEON::BI__builtin_neon_vcgth_f16: {
11004     llvm::CmpInst::Predicate P;
11005     switch (BuiltinID) {
11006     default: llvm_unreachable("missing builtin ID in switch!");
11007     case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
11008     case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
11009     case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
11010     case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
11011     case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
11012     }
11013     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11014     Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
11015     Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
11016     if (P == llvm::FCmpInst::FCMP_OEQ)
11017       Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11018     else
11019       Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11020     return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
11021   }
11022   case NEON::BI__builtin_neon_vceqd_s64:
11023   case NEON::BI__builtin_neon_vceqd_u64:
11024   case NEON::BI__builtin_neon_vcgtd_s64:
11025   case NEON::BI__builtin_neon_vcgtd_u64:
11026   case NEON::BI__builtin_neon_vcltd_s64:
11027   case NEON::BI__builtin_neon_vcltd_u64:
11028   case NEON::BI__builtin_neon_vcged_u64:
11029   case NEON::BI__builtin_neon_vcged_s64:
11030   case NEON::BI__builtin_neon_vcled_u64:
11031   case NEON::BI__builtin_neon_vcled_s64: {
11032     llvm::CmpInst::Predicate P;
11033     switch (BuiltinID) {
11034     default: llvm_unreachable("missing builtin ID in switch!");
11035     case NEON::BI__builtin_neon_vceqd_s64:
11036     case NEON::BI__builtin_neon_vceqd_u64: P = llvm::ICmpInst::ICMP_EQ; break;
11037     case NEON::BI__builtin_neon_vcgtd_s64: P = llvm::ICmpInst::ICMP_SGT; break;
11038     case NEON::BI__builtin_neon_vcgtd_u64: P = llvm::ICmpInst::ICMP_UGT; break;
11039     case NEON::BI__builtin_neon_vcltd_s64: P = llvm::ICmpInst::ICMP_SLT; break;
11040     case NEON::BI__builtin_neon_vcltd_u64: P = llvm::ICmpInst::ICMP_ULT; break;
11041     case NEON::BI__builtin_neon_vcged_u64: P = llvm::ICmpInst::ICMP_UGE; break;
11042     case NEON::BI__builtin_neon_vcged_s64: P = llvm::ICmpInst::ICMP_SGE; break;
11043     case NEON::BI__builtin_neon_vcled_u64: P = llvm::ICmpInst::ICMP_ULE; break;
11044     case NEON::BI__builtin_neon_vcled_s64: P = llvm::ICmpInst::ICMP_SLE; break;
11045     }
11046     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11047     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11048     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11049     Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
11050     return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
11051   }
11052   case NEON::BI__builtin_neon_vtstd_s64:
11053   case NEON::BI__builtin_neon_vtstd_u64: {
11054     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11055     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11056     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11057     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
11058     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
11059                                 llvm::Constant::getNullValue(Int64Ty));
11060     return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
11061   }
11062   case NEON::BI__builtin_neon_vset_lane_i8:
11063   case NEON::BI__builtin_neon_vset_lane_i16:
11064   case NEON::BI__builtin_neon_vset_lane_i32:
11065   case NEON::BI__builtin_neon_vset_lane_i64:
11066   case NEON::BI__builtin_neon_vset_lane_bf16:
11067   case NEON::BI__builtin_neon_vset_lane_f32:
11068   case NEON::BI__builtin_neon_vsetq_lane_i8:
11069   case NEON::BI__builtin_neon_vsetq_lane_i16:
11070   case NEON::BI__builtin_neon_vsetq_lane_i32:
11071   case NEON::BI__builtin_neon_vsetq_lane_i64:
11072   case NEON::BI__builtin_neon_vsetq_lane_bf16:
11073   case NEON::BI__builtin_neon_vsetq_lane_f32:
11074     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11075     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11076   case NEON::BI__builtin_neon_vset_lane_f64:
11077     // The vector type needs a cast for the v1f64 variant.
11078     Ops[1] =
11079         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
11080     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11081     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11082   case NEON::BI__builtin_neon_vsetq_lane_f64:
11083     // The vector type needs a cast for the v2f64 variant.
11084     Ops[1] =
11085         Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
11086     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11087     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11088 
11089   case NEON::BI__builtin_neon_vget_lane_i8:
11090   case NEON::BI__builtin_neon_vdupb_lane_i8:
11091     Ops[0] =
11092         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
11093     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11094                                         "vget_lane");
11095   case NEON::BI__builtin_neon_vgetq_lane_i8:
11096   case NEON::BI__builtin_neon_vdupb_laneq_i8:
11097     Ops[0] =
11098         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
11099     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11100                                         "vgetq_lane");
11101   case NEON::BI__builtin_neon_vget_lane_i16:
11102   case NEON::BI__builtin_neon_vduph_lane_i16:
11103     Ops[0] =
11104         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
11105     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11106                                         "vget_lane");
11107   case NEON::BI__builtin_neon_vgetq_lane_i16:
11108   case NEON::BI__builtin_neon_vduph_laneq_i16:
11109     Ops[0] =
11110         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
11111     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11112                                         "vgetq_lane");
11113   case NEON::BI__builtin_neon_vget_lane_i32:
11114   case NEON::BI__builtin_neon_vdups_lane_i32:
11115     Ops[0] =
11116         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
11117     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11118                                         "vget_lane");
11119   case NEON::BI__builtin_neon_vdups_lane_f32:
11120     Ops[0] =
11121         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11122     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11123                                         "vdups_lane");
11124   case NEON::BI__builtin_neon_vgetq_lane_i32:
11125   case NEON::BI__builtin_neon_vdups_laneq_i32:
11126     Ops[0] =
11127         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
11128     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11129                                         "vgetq_lane");
11130   case NEON::BI__builtin_neon_vget_lane_i64:
11131   case NEON::BI__builtin_neon_vdupd_lane_i64:
11132     Ops[0] =
11133         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
11134     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11135                                         "vget_lane");
11136   case NEON::BI__builtin_neon_vdupd_lane_f64:
11137     Ops[0] =
11138         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11139     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11140                                         "vdupd_lane");
11141   case NEON::BI__builtin_neon_vgetq_lane_i64:
11142   case NEON::BI__builtin_neon_vdupd_laneq_i64:
11143     Ops[0] =
11144         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
11145     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11146                                         "vgetq_lane");
11147   case NEON::BI__builtin_neon_vget_lane_f32:
11148     Ops[0] =
11149         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11150     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11151                                         "vget_lane");
11152   case NEON::BI__builtin_neon_vget_lane_f64:
11153     Ops[0] =
11154         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11155     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11156                                         "vget_lane");
11157   case NEON::BI__builtin_neon_vgetq_lane_f32:
11158   case NEON::BI__builtin_neon_vdups_laneq_f32:
11159     Ops[0] =
11160         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
11161     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11162                                         "vgetq_lane");
11163   case NEON::BI__builtin_neon_vgetq_lane_f64:
11164   case NEON::BI__builtin_neon_vdupd_laneq_f64:
11165     Ops[0] =
11166         Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
11167     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11168                                         "vgetq_lane");
11169   case NEON::BI__builtin_neon_vaddh_f16:
11170     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11171     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
11172   case NEON::BI__builtin_neon_vsubh_f16:
11173     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11174     return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
11175   case NEON::BI__builtin_neon_vmulh_f16:
11176     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11177     return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
11178   case NEON::BI__builtin_neon_vdivh_f16:
11179     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11180     return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
11181   case NEON::BI__builtin_neon_vfmah_f16:
11182     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11183     return emitCallMaybeConstrainedFPBuiltin(
11184         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11185         {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
11186   case NEON::BI__builtin_neon_vfmsh_f16: {
11187     Value* Neg = Builder.CreateFNeg(EmitScalarExpr(E->getArg(1)), "vsubh");
11188 
11189     // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11190     return emitCallMaybeConstrainedFPBuiltin(
11191         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11192         {Neg, EmitScalarExpr(E->getArg(2)), Ops[0]});
11193   }
11194   case NEON::BI__builtin_neon_vaddd_s64:
11195   case NEON::BI__builtin_neon_vaddd_u64:
11196     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
11197   case NEON::BI__builtin_neon_vsubd_s64:
11198   case NEON::BI__builtin_neon_vsubd_u64:
11199     return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
11200   case NEON::BI__builtin_neon_vqdmlalh_s16:
11201   case NEON::BI__builtin_neon_vqdmlslh_s16: {
11202     SmallVector<Value *, 2> ProductOps;
11203     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11204     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
11205     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11206     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11207                           ProductOps, "vqdmlXl");
11208     Constant *CI = ConstantInt::get(SizeTy, 0);
11209     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11210 
11211     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
11212                                         ? Intrinsic::aarch64_neon_sqadd
11213                                         : Intrinsic::aarch64_neon_sqsub;
11214     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
11215   }
11216   case NEON::BI__builtin_neon_vqshlud_n_s64: {
11217     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11218     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11219     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
11220                         Ops, "vqshlu_n");
11221   }
11222   case NEON::BI__builtin_neon_vqshld_n_u64:
11223   case NEON::BI__builtin_neon_vqshld_n_s64: {
11224     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
11225                                    ? Intrinsic::aarch64_neon_uqshl
11226                                    : Intrinsic::aarch64_neon_sqshl;
11227     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11228     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11229     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
11230   }
11231   case NEON::BI__builtin_neon_vrshrd_n_u64:
11232   case NEON::BI__builtin_neon_vrshrd_n_s64: {
11233     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
11234                                    ? Intrinsic::aarch64_neon_urshl
11235                                    : Intrinsic::aarch64_neon_srshl;
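    // A rounding shift right by N is emitted as a rounding shift left by -N.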
11236     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11237     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
11238     Ops[1] = ConstantInt::get(Int64Ty, -SV);
11239     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
11240   }
11241   case NEON::BI__builtin_neon_vrsrad_n_u64:
11242   case NEON::BI__builtin_neon_vrsrad_n_s64: {
11243     unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
11244                                    ? Intrinsic::aarch64_neon_urshl
11245                                    : Intrinsic::aarch64_neon_srshl;
11246     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11247     Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
11248     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
11249                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
11250     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
11251   }
11252   case NEON::BI__builtin_neon_vshld_n_s64:
11253   case NEON::BI__builtin_neon_vshld_n_u64: {
11254     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11255     return Builder.CreateShl(
11256         Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
11257   }
11258   case NEON::BI__builtin_neon_vshrd_n_s64: {
11259     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
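    // An arithmetic shift by 64 yields poison in LLVM IR; clamp the amount to
    // 63, which produces the same sign-replicated result.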
11260     return Builder.CreateAShr(
11261         Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11262                                                    Amt->getZExtValue())),
11263         "shrd_n");
11264   }
11265   case NEON::BI__builtin_neon_vshrd_n_u64: {
11266     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11267     uint64_t ShiftAmt = Amt->getZExtValue();
11268     // Right-shifting an unsigned value by its size yields 0.
11269     if (ShiftAmt == 64)
11270       return ConstantInt::get(Int64Ty, 0);
11271     return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
11272                               "shrd_n");
11273   }
11274   case NEON::BI__builtin_neon_vsrad_n_s64: {
11275     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
11276     Ops[1] = Builder.CreateAShr(
11277         Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11278                                                    Amt->getZExtValue())),
11279         "shrd_n");
11280     return Builder.CreateAdd(Ops[0], Ops[1]);
11281   }
11282   case NEON::BI__builtin_neon_vsrad_n_u64: {
11283     llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
11284     uint64_t ShiftAmt = Amt->getZExtValue();
11285     // Right-shifting an unsigned value by its size yields 0.
11286     // As Op + 0 = Op, return Ops[0] directly.
11287     if (ShiftAmt == 64)
11288       return Ops[0];
11289     Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
11290                                 "shrd_n");
11291     return Builder.CreateAdd(Ops[0], Ops[1]);
11292   }
11293   case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
11294   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
11295   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
11296   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
11297     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
11298                                           "lane");
11299     SmallVector<Value *, 2> ProductOps;
11300     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11301     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
11302     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11303     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11304                           ProductOps, "vqdmlXl");
11305     Constant *CI = ConstantInt::get(SizeTy, 0);
11306     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11307     Ops.pop_back();
11308 
11309     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
11310                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
11311                           ? Intrinsic::aarch64_neon_sqadd
11312                           : Intrinsic::aarch64_neon_sqsub;
11313     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
11314   }
11315   case NEON::BI__builtin_neon_vqdmlals_s32:
11316   case NEON::BI__builtin_neon_vqdmlsls_s32: {
11317     SmallVector<Value *, 2> ProductOps;
11318     ProductOps.push_back(Ops[1]);
11319     ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
11320     Ops[1] =
11321         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
11322                      ProductOps, "vqdmlXl");
11323 
11324     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
11325                                         ? Intrinsic::aarch64_neon_sqadd
11326                                         : Intrinsic::aarch64_neon_sqsub;
11327     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
11328   }
11329   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
11330   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
11331   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
11332   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
11333     Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
11334                                           "lane");
11335     SmallVector<Value *, 2> ProductOps;
11336     ProductOps.push_back(Ops[1]);
11337     ProductOps.push_back(Ops[2]);
11338     Ops[1] =
11339         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
11340                      ProductOps, "vqdmlXl");
11341     Ops.pop_back();
11342 
11343     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
11344                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
11345                           ? Intrinsic::aarch64_neon_sqadd
11346                           : Intrinsic::aarch64_neon_sqsub;
11347     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
11348   }
11349   case NEON::BI__builtin_neon_vget_lane_bf16:
11350   case NEON::BI__builtin_neon_vduph_lane_bf16:
11351   case NEON::BI__builtin_neon_vduph_lane_f16: {
11352     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11353                                         "vget_lane");
11354   }
11355   case NEON::BI__builtin_neon_vgetq_lane_bf16:
11356   case NEON::BI__builtin_neon_vduph_laneq_bf16:
11357   case NEON::BI__builtin_neon_vduph_laneq_f16: {
11358     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11359                                         "vgetq_lane");
11360   }
11361 
11362   case clang::AArch64::BI_InterlockedAdd: {
11363     Value *Arg0 = EmitScalarExpr(E->getArg(0));
11364     Value *Arg1 = EmitScalarExpr(E->getArg(1));
11365     AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
11366       AtomicRMWInst::Add, Arg0, Arg1,
11367       llvm::AtomicOrdering::SequentiallyConsistent);
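    // atomicrmw yields the value that was in memory before the add, while
    // _InterlockedAdd returns the new value, so add Arg1 to the result.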
11368     return Builder.CreateAdd(RMWI, Arg1);
11369   }
11370   }
11371 
11372   llvm::FixedVectorType *VTy = GetNeonType(this, Type);
11373   llvm::Type *Ty = VTy;
11374   if (!Ty)
11375     return nullptr;
11376 
11377   // Not all intrinsics handled by the common case work for AArch64 yet, so only
11378   // defer to common code if the builtin has been added to our special map.
11379   Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
11380                                         AArch64SIMDIntrinsicsProvenSorted);
11381 
11382   if (Builtin)
11383     return EmitCommonNeonBuiltinExpr(
11384         Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
11385         Builtin->NameHint, Builtin->TypeModifier, E, Ops,
11386         /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
11387 
11388   if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
11389     return V;
11390 
11391   unsigned Int;
11392   switch (BuiltinID) {
11393   default: return nullptr;
11394   case NEON::BI__builtin_neon_vbsl_v:
11395   case NEON::BI__builtin_neon_vbslq_v: {
11396     llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
11397     Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
11398     Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
11399     Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
11400 
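    // Bitwise select: (Ops[0] & Ops[1]) | (~Ops[0] & Ops[2]).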
11401     Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
11402     Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
11403     Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
11404     return Builder.CreateBitCast(Ops[0], Ty);
11405   }
11406   case NEON::BI__builtin_neon_vfma_lane_v:
11407   case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
11408     // The ARM builtins (and instructions) have the addend as the first
11409     // operand, but the 'fma' intrinsics have it last. Swap it around here.
11410     Value *Addend = Ops[0];
11411     Value *Multiplicand = Ops[1];
11412     Value *LaneSource = Ops[2];
11413     Ops[0] = Multiplicand;
11414     Ops[1] = LaneSource;
11415     Ops[2] = Addend;
11416 
11417     // Now adjust things to handle the lane access.
11418     auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
11419                          ? llvm::FixedVectorType::get(VTy->getElementType(),
11420                                                       VTy->getNumElements() / 2)
11421                          : VTy;
11422     llvm::Constant *cst = cast<Constant>(Ops[3]);
11423     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
11424     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
11425     Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
11426 
11427     Ops.pop_back();
11428     Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
11429                                        : Intrinsic::fma;
11430     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
11431   }
11432   case NEON::BI__builtin_neon_vfma_laneq_v: {
11433     auto *VTy = cast<llvm::FixedVectorType>(Ty);
11434     // v1f64 fma should be mapped to the NEON scalar f64 fma.
11435     if (VTy && VTy->getElementType() == DoubleTy) {
11436       Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11437       Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
11438       llvm::FixedVectorType *VTy =
11439           GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
11440       Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
11441       Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
11442       Value *Result;
11443       Result = emitCallMaybeConstrainedFPBuiltin(
11444           *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
11445           DoubleTy, {Ops[1], Ops[2], Ops[0]});
11446       return Builder.CreateBitCast(Result, Ty);
11447     }
11448     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11449     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11450 
11451     auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
11452                                            VTy->getNumElements() * 2);
11453     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
11454     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
11455                                                cast<ConstantInt>(Ops[3]));
11456     Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
11457 
11458     return emitCallMaybeConstrainedFPBuiltin(
11459         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
11460         {Ops[2], Ops[1], Ops[0]});
11461   }
11462   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
11463     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11464     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11465 
11466     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
11467     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
11468     return emitCallMaybeConstrainedFPBuiltin(
11469         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
11470         {Ops[2], Ops[1], Ops[0]});
11471   }
11472   case NEON::BI__builtin_neon_vfmah_lane_f16:
11473   case NEON::BI__builtin_neon_vfmas_lane_f32:
11474   case NEON::BI__builtin_neon_vfmah_laneq_f16:
11475   case NEON::BI__builtin_neon_vfmas_laneq_f32:
11476   case NEON::BI__builtin_neon_vfmad_lane_f64:
11477   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
11478     Ops.push_back(EmitScalarExpr(E->getArg(3)));
11479     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
11480     Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
11481     return emitCallMaybeConstrainedFPBuiltin(
11482         *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
11483         {Ops[1], Ops[2], Ops[0]});
11484   }
11485   case NEON::BI__builtin_neon_vmull_v:
11486     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11487     Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
11488     if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
11489     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
11490   case NEON::BI__builtin_neon_vmax_v:
11491   case NEON::BI__builtin_neon_vmaxq_v:
11492     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11493     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
11494     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
11495     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
11496   case NEON::BI__builtin_neon_vmaxh_f16: {
11497     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11498     Int = Intrinsic::aarch64_neon_fmax;
11499     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
11500   }
11501   case NEON::BI__builtin_neon_vmin_v:
11502   case NEON::BI__builtin_neon_vminq_v:
11503     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11504     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
11505     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
11506     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
11507   case NEON::BI__builtin_neon_vminh_f16: {
11508     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11509     Int = Intrinsic::aarch64_neon_fmin;
11510     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
11511   }
11512   case NEON::BI__builtin_neon_vabd_v:
11513   case NEON::BI__builtin_neon_vabdq_v:
11514     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11515     Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
11516     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
11517     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
11518   case NEON::BI__builtin_neon_vpadal_v:
11519   case NEON::BI__builtin_neon_vpadalq_v: {
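    // Pairwise add-and-accumulate: lower as a pairwise long add ([su]addlp)
    // of the source vector followed by an ordinary add of the accumulator.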
11520     unsigned ArgElts = VTy->getNumElements();
11521     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
11522     unsigned BitWidth = EltTy->getBitWidth();
11523     auto *ArgTy = llvm::FixedVectorType::get(
11524         llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
11525     llvm::Type* Tys[2] = { VTy, ArgTy };
11526     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
11527     SmallVector<llvm::Value*, 1> TmpOps;
11528     TmpOps.push_back(Ops[1]);
11529     Function *F = CGM.getIntrinsic(Int, Tys);
11530     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
11531     llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
11532     return Builder.CreateAdd(tmp, addend);
11533   }
11534   case NEON::BI__builtin_neon_vpmin_v:
11535   case NEON::BI__builtin_neon_vpminq_v:
11536     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11537     Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
11538     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
11539     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
11540   case NEON::BI__builtin_neon_vpmax_v:
11541   case NEON::BI__builtin_neon_vpmaxq_v:
11542     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
11543     Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
11544     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
11545     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
11546   case NEON::BI__builtin_neon_vminnm_v:
11547   case NEON::BI__builtin_neon_vminnmq_v:
11548     Int = Intrinsic::aarch64_neon_fminnm;
11549     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
11550   case NEON::BI__builtin_neon_vminnmh_f16:
11551     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11552     Int = Intrinsic::aarch64_neon_fminnm;
11553     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
11554   case NEON::BI__builtin_neon_vmaxnm_v:
11555   case NEON::BI__builtin_neon_vmaxnmq_v:
11556     Int = Intrinsic::aarch64_neon_fmaxnm;
11557     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
11558   case NEON::BI__builtin_neon_vmaxnmh_f16:
11559     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11560     Int = Intrinsic::aarch64_neon_fmaxnm;
11561     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
11562   case NEON::BI__builtin_neon_vrecpss_f32: {
11563     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11564     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
11565                         Ops, "vrecps");
11566   }
11567   case NEON::BI__builtin_neon_vrecpsd_f64:
11568     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11569     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
11570                         Ops, "vrecps");
11571   case NEON::BI__builtin_neon_vrecpsh_f16:
11572     Ops.push_back(EmitScalarExpr(E->getArg(1)));
11573     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
11574                         Ops, "vrecps");
11575   case NEON::BI__builtin_neon_vqshrun_n_v:
11576     Int = Intrinsic::aarch64_neon_sqshrun;
11577     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
11578   case NEON::BI__builtin_neon_vqrshrun_n_v:
11579     Int = Intrinsic::aarch64_neon_sqrshrun;
11580     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
11581   case NEON::BI__builtin_neon_vqshrn_n_v:
11582     Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
11583     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
11584   case NEON::BI__builtin_neon_vrshrn_n_v:
11585     Int = Intrinsic::aarch64_neon_rshrn;
11586     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
11587   case NEON::BI__builtin_neon_vqrshrn_n_v:
11588     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
11589     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
11590   case NEON::BI__builtin_neon_vrndah_f16: {
11591     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11592     Int = Builder.getIsFPConstrained()
11593               ? Intrinsic::experimental_constrained_round
11594               : Intrinsic::round;
11595     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
11596   }
11597   case NEON::BI__builtin_neon_vrnda_v:
11598   case NEON::BI__builtin_neon_vrndaq_v: {
11599     Int = Builder.getIsFPConstrained()
11600               ? Intrinsic::experimental_constrained_round
11601               : Intrinsic::round;
11602     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
11603   }
11604   case NEON::BI__builtin_neon_vrndih_f16: {
11605     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11606     Int = Builder.getIsFPConstrained()
11607               ? Intrinsic::experimental_constrained_nearbyint
11608               : Intrinsic::nearbyint;
11609     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
11610   }
11611   case NEON::BI__builtin_neon_vrndmh_f16: {
11612     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11613     Int = Builder.getIsFPConstrained()
11614               ? Intrinsic::experimental_constrained_floor
11615               : Intrinsic::floor;
11616     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
11617   }
11618   case NEON::BI__builtin_neon_vrndm_v:
11619   case NEON::BI__builtin_neon_vrndmq_v: {
11620     Int = Builder.getIsFPConstrained()
11621               ? Intrinsic::experimental_constrained_floor
11622               : Intrinsic::floor;
11623     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
11624   }
11625   case NEON::BI__builtin_neon_vrndnh_f16: {
11626     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11627     Int = Builder.getIsFPConstrained()
11628               ? Intrinsic::experimental_constrained_roundeven
11629               : Intrinsic::roundeven;
11630     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
11631   }
11632   case NEON::BI__builtin_neon_vrndn_v:
11633   case NEON::BI__builtin_neon_vrndnq_v: {
11634     Int = Builder.getIsFPConstrained()
11635               ? Intrinsic::experimental_constrained_roundeven
11636               : Intrinsic::roundeven;
11637     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
11638   }
11639   case NEON::BI__builtin_neon_vrndns_f32: {
11640     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11641     Int = Builder.getIsFPConstrained()
11642               ? Intrinsic::experimental_constrained_roundeven
11643               : Intrinsic::roundeven;
11644     return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
11645   }
11646   case NEON::BI__builtin_neon_vrndph_f16: {
11647     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11648     Int = Builder.getIsFPConstrained()
11649               ? Intrinsic::experimental_constrained_ceil
11650               : Intrinsic::ceil;
11651     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
11652   }
11653   case NEON::BI__builtin_neon_vrndp_v:
11654   case NEON::BI__builtin_neon_vrndpq_v: {
11655     Int = Builder.getIsFPConstrained()
11656               ? Intrinsic::experimental_constrained_ceil
11657               : Intrinsic::ceil;
11658     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
11659   }
11660   case NEON::BI__builtin_neon_vrndxh_f16: {
11661     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11662     Int = Builder.getIsFPConstrained()
11663               ? Intrinsic::experimental_constrained_rint
11664               : Intrinsic::rint;
11665     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
11666   }
11667   case NEON::BI__builtin_neon_vrndx_v:
11668   case NEON::BI__builtin_neon_vrndxq_v: {
11669     Int = Builder.getIsFPConstrained()
11670               ? Intrinsic::experimental_constrained_rint
11671               : Intrinsic::rint;
11672     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
11673   }
11674   case NEON::BI__builtin_neon_vrndh_f16: {
11675     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11676     Int = Builder.getIsFPConstrained()
11677               ? Intrinsic::experimental_constrained_trunc
11678               : Intrinsic::trunc;
11679     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
11680   }
11681   case NEON::BI__builtin_neon_vrnd32x_f32:
11682   case NEON::BI__builtin_neon_vrnd32xq_f32: {
11683     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11684     Int = Intrinsic::aarch64_neon_frint32x;
11685     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
11686   }
11687   case NEON::BI__builtin_neon_vrnd32z_f32:
11688   case NEON::BI__builtin_neon_vrnd32zq_f32: {
11689     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11690     Int = Intrinsic::aarch64_neon_frint32z;
11691     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
11692   }
11693   case NEON::BI__builtin_neon_vrnd64x_f32:
11694   case NEON::BI__builtin_neon_vrnd64xq_f32: {
11695     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11696     Int = Intrinsic::aarch64_neon_frint64x;
11697     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
11698   }
11699   case NEON::BI__builtin_neon_vrnd64z_f32:
11700   case NEON::BI__builtin_neon_vrnd64zq_f32: {
11701     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11702     Int = Intrinsic::aarch64_neon_frint64z;
11703     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
11704   }
11705   case NEON::BI__builtin_neon_vrnd_v:
11706   case NEON::BI__builtin_neon_vrndq_v: {
11707     Int = Builder.getIsFPConstrained()
11708               ? Intrinsic::experimental_constrained_trunc
11709               : Intrinsic::trunc;
11710     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
11711   }
11712   case NEON::BI__builtin_neon_vcvt_f64_v:
11713   case NEON::BI__builtin_neon_vcvtq_f64_v:
11714     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11715     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
11716     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
11717                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
11718   case NEON::BI__builtin_neon_vcvt_f64_f32: {
11719     assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
11720            "unexpected vcvt_f64_f32 builtin");
11721     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
11722     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
11723 
11724     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
11725   }
11726   case NEON::BI__builtin_neon_vcvt_f32_f64: {
11727     assert(Type.getEltType() == NeonTypeFlags::Float32 &&
11728            "unexpected vcvt_f32_f64 builtin");
11729     NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
11730     Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
11731 
11732     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
11733   }
11734   case NEON::BI__builtin_neon_vcvt_s32_v:
11735   case NEON::BI__builtin_neon_vcvt_u32_v:
11736   case NEON::BI__builtin_neon_vcvt_s64_v:
11737   case NEON::BI__builtin_neon_vcvt_u64_v:
11738   case NEON::BI__builtin_neon_vcvt_s16_f16:
11739   case NEON::BI__builtin_neon_vcvt_u16_f16:
11740   case NEON::BI__builtin_neon_vcvtq_s32_v:
11741   case NEON::BI__builtin_neon_vcvtq_u32_v:
11742   case NEON::BI__builtin_neon_vcvtq_s64_v:
11743   case NEON::BI__builtin_neon_vcvtq_u64_v:
11744   case NEON::BI__builtin_neon_vcvtq_s16_f16:
11745   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
11746     Int =
11747         usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
11748     llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
11749     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
11750   }
11751   case NEON::BI__builtin_neon_vcvta_s16_f16:
11752   case NEON::BI__builtin_neon_vcvta_u16_f16:
11753   case NEON::BI__builtin_neon_vcvta_s32_v:
11754   case NEON::BI__builtin_neon_vcvtaq_s16_f16:
11755   case NEON::BI__builtin_neon_vcvtaq_s32_v:
11756   case NEON::BI__builtin_neon_vcvta_u32_v:
11757   case NEON::BI__builtin_neon_vcvtaq_u16_f16:
11758   case NEON::BI__builtin_neon_vcvtaq_u32_v:
11759   case NEON::BI__builtin_neon_vcvta_s64_v:
11760   case NEON::BI__builtin_neon_vcvtaq_s64_v:
11761   case NEON::BI__builtin_neon_vcvta_u64_v:
11762   case NEON::BI__builtin_neon_vcvtaq_u64_v: {
11763     Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
11764     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11765     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
11766   }
11767   case NEON::BI__builtin_neon_vcvtm_s16_f16:
11768   case NEON::BI__builtin_neon_vcvtm_s32_v:
11769   case NEON::BI__builtin_neon_vcvtmq_s16_f16:
11770   case NEON::BI__builtin_neon_vcvtmq_s32_v:
11771   case NEON::BI__builtin_neon_vcvtm_u16_f16:
11772   case NEON::BI__builtin_neon_vcvtm_u32_v:
11773   case NEON::BI__builtin_neon_vcvtmq_u16_f16:
11774   case NEON::BI__builtin_neon_vcvtmq_u32_v:
11775   case NEON::BI__builtin_neon_vcvtm_s64_v:
11776   case NEON::BI__builtin_neon_vcvtmq_s64_v:
11777   case NEON::BI__builtin_neon_vcvtm_u64_v:
11778   case NEON::BI__builtin_neon_vcvtmq_u64_v: {
11779     Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
11780     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11781     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
11782   }
11783   case NEON::BI__builtin_neon_vcvtn_s16_f16:
11784   case NEON::BI__builtin_neon_vcvtn_s32_v:
11785   case NEON::BI__builtin_neon_vcvtnq_s16_f16:
11786   case NEON::BI__builtin_neon_vcvtnq_s32_v:
11787   case NEON::BI__builtin_neon_vcvtn_u16_f16:
11788   case NEON::BI__builtin_neon_vcvtn_u32_v:
11789   case NEON::BI__builtin_neon_vcvtnq_u16_f16:
11790   case NEON::BI__builtin_neon_vcvtnq_u32_v:
11791   case NEON::BI__builtin_neon_vcvtn_s64_v:
11792   case NEON::BI__builtin_neon_vcvtnq_s64_v:
11793   case NEON::BI__builtin_neon_vcvtn_u64_v:
11794   case NEON::BI__builtin_neon_vcvtnq_u64_v: {
11795     Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
11796     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11797     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
11798   }
11799   case NEON::BI__builtin_neon_vcvtp_s16_f16:
11800   case NEON::BI__builtin_neon_vcvtp_s32_v:
11801   case NEON::BI__builtin_neon_vcvtpq_s16_f16:
11802   case NEON::BI__builtin_neon_vcvtpq_s32_v:
11803   case NEON::BI__builtin_neon_vcvtp_u16_f16:
11804   case NEON::BI__builtin_neon_vcvtp_u32_v:
11805   case NEON::BI__builtin_neon_vcvtpq_u16_f16:
11806   case NEON::BI__builtin_neon_vcvtpq_u32_v:
11807   case NEON::BI__builtin_neon_vcvtp_s64_v:
11808   case NEON::BI__builtin_neon_vcvtpq_s64_v:
11809   case NEON::BI__builtin_neon_vcvtp_u64_v:
11810   case NEON::BI__builtin_neon_vcvtpq_u64_v: {
11811     Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
11812     llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
11813     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
11814   }
11815   case NEON::BI__builtin_neon_vmulx_v:
11816   case NEON::BI__builtin_neon_vmulxq_v: {
11817     Int = Intrinsic::aarch64_neon_fmulx;
11818     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
11819   }
11820   case NEON::BI__builtin_neon_vmulxh_lane_f16:
11821   case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
11822     // vmulx_lane should be mapped to the Neon scalar mulx after
11823     // extracting the scalar element.
11824     Ops.push_back(EmitScalarExpr(E->getArg(2)));
11825     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
11826     Ops.pop_back();
11827     Int = Intrinsic::aarch64_neon_fmulx;
11828     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
11829   }
11830   case NEON::BI__builtin_neon_vmul_lane_v:
11831   case NEON::BI__builtin_neon_vmul_laneq_v: {
11832     // v1f64 vmul_lane should be mapped to the Neon scalar mul lane.
11833     bool Quad = false;
11834     if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
11835       Quad = true;
11836     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11837     llvm::FixedVectorType *VTy =
11838         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
11839     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
11840     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
11841     Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
11842     return Builder.CreateBitCast(Result, Ty);
11843   }
11844   case NEON::BI__builtin_neon_vnegd_s64:
11845     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
11846   case NEON::BI__builtin_neon_vnegh_f16:
11847     return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
11848   case NEON::BI__builtin_neon_vpmaxnm_v:
11849   case NEON::BI__builtin_neon_vpmaxnmq_v: {
11850     Int = Intrinsic::aarch64_neon_fmaxnmp;
11851     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
11852   }
11853   case NEON::BI__builtin_neon_vpminnm_v:
11854   case NEON::BI__builtin_neon_vpminnmq_v: {
11855     Int = Intrinsic::aarch64_neon_fminnmp;
11856     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
11857   }
11858   case NEON::BI__builtin_neon_vsqrth_f16: {
11859     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11860     Int = Builder.getIsFPConstrained()
11861               ? Intrinsic::experimental_constrained_sqrt
11862               : Intrinsic::sqrt;
11863     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
11864   }
11865   case NEON::BI__builtin_neon_vsqrt_v:
11866   case NEON::BI__builtin_neon_vsqrtq_v: {
11867     Int = Builder.getIsFPConstrained()
11868               ? Intrinsic::experimental_constrained_sqrt
11869               : Intrinsic::sqrt;
11870     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11871     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
11872   }
11873   case NEON::BI__builtin_neon_vrbit_v:
11874   case NEON::BI__builtin_neon_vrbitq_v: {
11875     Int = Intrinsic::bitreverse;
11876     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
11877   }
11878   case NEON::BI__builtin_neon_vaddv_u8:
11879     // FIXME: These are handled by the AArch64 scalar code.
11880     usgn = true;
11881     [[fallthrough]];
11882   case NEON::BI__builtin_neon_vaddv_s8: {
11883     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11884     Ty = Int32Ty;
11885     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11886     llvm::Type *Tys[2] = { Ty, VTy };
11887     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11888     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11889     return Builder.CreateTrunc(Ops[0], Int8Ty);
11890   }
11891   case NEON::BI__builtin_neon_vaddv_u16:
11892     usgn = true;
11893     [[fallthrough]];
11894   case NEON::BI__builtin_neon_vaddv_s16: {
11895     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11896     Ty = Int32Ty;
11897     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11898     llvm::Type *Tys[2] = { Ty, VTy };
11899     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11900     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11901     return Builder.CreateTrunc(Ops[0], Int16Ty);
11902   }
11903   case NEON::BI__builtin_neon_vaddvq_u8:
11904     usgn = true;
11905     [[fallthrough]];
11906   case NEON::BI__builtin_neon_vaddvq_s8: {
11907     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11908     Ty = Int32Ty;
11909     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11910     llvm::Type *Tys[2] = { Ty, VTy };
11911     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11912     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11913     return Builder.CreateTrunc(Ops[0], Int8Ty);
11914   }
11915   case NEON::BI__builtin_neon_vaddvq_u16:
11916     usgn = true;
11917     [[fallthrough]];
11918   case NEON::BI__builtin_neon_vaddvq_s16: {
11919     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
11920     Ty = Int32Ty;
11921     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11922     llvm::Type *Tys[2] = { Ty, VTy };
11923     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11924     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
11925     return Builder.CreateTrunc(Ops[0], Int16Ty);
11926   }
11927   case NEON::BI__builtin_neon_vmaxv_u8: {
11928     Int = Intrinsic::aarch64_neon_umaxv;
11929     Ty = Int32Ty;
11930     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11931     llvm::Type *Tys[2] = { Ty, VTy };
11932     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11933     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11934     return Builder.CreateTrunc(Ops[0], Int8Ty);
11935   }
11936   case NEON::BI__builtin_neon_vmaxv_u16: {
11937     Int = Intrinsic::aarch64_neon_umaxv;
11938     Ty = Int32Ty;
11939     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11940     llvm::Type *Tys[2] = { Ty, VTy };
11941     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11942     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11943     return Builder.CreateTrunc(Ops[0], Int16Ty);
11944   }
11945   case NEON::BI__builtin_neon_vmaxvq_u8: {
11946     Int = Intrinsic::aarch64_neon_umaxv;
11947     Ty = Int32Ty;
11948     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11949     llvm::Type *Tys[2] = { Ty, VTy };
11950     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11951     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11952     return Builder.CreateTrunc(Ops[0], Int8Ty);
11953   }
11954   case NEON::BI__builtin_neon_vmaxvq_u16: {
11955     Int = Intrinsic::aarch64_neon_umaxv;
11956     Ty = Int32Ty;
11957     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11958     llvm::Type *Tys[2] = { Ty, VTy };
11959     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11960     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11961     return Builder.CreateTrunc(Ops[0], Int16Ty);
11962   }
11963   case NEON::BI__builtin_neon_vmaxv_s8: {
11964     Int = Intrinsic::aarch64_neon_smaxv;
11965     Ty = Int32Ty;
11966     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
11967     llvm::Type *Tys[2] = { Ty, VTy };
11968     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11969     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11970     return Builder.CreateTrunc(Ops[0], Int8Ty);
11971   }
11972   case NEON::BI__builtin_neon_vmaxv_s16: {
11973     Int = Intrinsic::aarch64_neon_smaxv;
11974     Ty = Int32Ty;
11975     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
11976     llvm::Type *Tys[2] = { Ty, VTy };
11977     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11978     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11979     return Builder.CreateTrunc(Ops[0], Int16Ty);
11980   }
11981   case NEON::BI__builtin_neon_vmaxvq_s8: {
11982     Int = Intrinsic::aarch64_neon_smaxv;
11983     Ty = Int32Ty;
11984     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
11985     llvm::Type *Tys[2] = { Ty, VTy };
11986     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11987     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11988     return Builder.CreateTrunc(Ops[0], Int8Ty);
11989   }
11990   case NEON::BI__builtin_neon_vmaxvq_s16: {
11991     Int = Intrinsic::aarch64_neon_smaxv;
11992     Ty = Int32Ty;
11993     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
11994     llvm::Type *Tys[2] = { Ty, VTy };
11995     Ops.push_back(EmitScalarExpr(E->getArg(0)));
11996     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
11997     return Builder.CreateTrunc(Ops[0], Int16Ty);
11998   }
11999   case NEON::BI__builtin_neon_vmaxv_f16: {
12000     Int = Intrinsic::aarch64_neon_fmaxv;
12001     Ty = HalfTy;
12002     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12003     llvm::Type *Tys[2] = { Ty, VTy };
12004     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12005     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12006     return Builder.CreateTrunc(Ops[0], HalfTy);
12007   }
12008   case NEON::BI__builtin_neon_vmaxvq_f16: {
12009     Int = Intrinsic::aarch64_neon_fmaxv;
12010     Ty = HalfTy;
12011     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12012     llvm::Type *Tys[2] = { Ty, VTy };
12013     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12014     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12015     return Builder.CreateTrunc(Ops[0], HalfTy);
12016   }
12017   case NEON::BI__builtin_neon_vminv_u8: {
12018     Int = Intrinsic::aarch64_neon_uminv;
12019     Ty = Int32Ty;
12020     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12021     llvm::Type *Tys[2] = { Ty, VTy };
12022     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12023     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12024     return Builder.CreateTrunc(Ops[0], Int8Ty);
12025   }
12026   case NEON::BI__builtin_neon_vminv_u16: {
12027     Int = Intrinsic::aarch64_neon_uminv;
12028     Ty = Int32Ty;
12029     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12030     llvm::Type *Tys[2] = { Ty, VTy };
12031     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12032     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12033     return Builder.CreateTrunc(Ops[0], Int16Ty);
12034   }
12035   case NEON::BI__builtin_neon_vminvq_u8: {
12036     Int = Intrinsic::aarch64_neon_uminv;
12037     Ty = Int32Ty;
12038     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12039     llvm::Type *Tys[2] = { Ty, VTy };
12040     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12041     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12042     return Builder.CreateTrunc(Ops[0], Int8Ty);
12043   }
12044   case NEON::BI__builtin_neon_vminvq_u16: {
12045     Int = Intrinsic::aarch64_neon_uminv;
12046     Ty = Int32Ty;
12047     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12048     llvm::Type *Tys[2] = { Ty, VTy };
12049     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12050     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12051     return Builder.CreateTrunc(Ops[0], Int16Ty);
12052   }
12053   case NEON::BI__builtin_neon_vminv_s8: {
12054     Int = Intrinsic::aarch64_neon_sminv;
12055     Ty = Int32Ty;
12056     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12057     llvm::Type *Tys[2] = { Ty, VTy };
12058     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12059     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12060     return Builder.CreateTrunc(Ops[0], Int8Ty);
12061   }
12062   case NEON::BI__builtin_neon_vminv_s16: {
12063     Int = Intrinsic::aarch64_neon_sminv;
12064     Ty = Int32Ty;
12065     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12066     llvm::Type *Tys[2] = { Ty, VTy };
12067     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12068     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12069     return Builder.CreateTrunc(Ops[0], Int16Ty);
12070   }
12071   case NEON::BI__builtin_neon_vminvq_s8: {
12072     Int = Intrinsic::aarch64_neon_sminv;
12073     Ty = Int32Ty;
12074     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12075     llvm::Type *Tys[2] = { Ty, VTy };
12076     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12077     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12078     return Builder.CreateTrunc(Ops[0], Int8Ty);
12079   }
12080   case NEON::BI__builtin_neon_vminvq_s16: {
12081     Int = Intrinsic::aarch64_neon_sminv;
12082     Ty = Int32Ty;
12083     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12084     llvm::Type *Tys[2] = { Ty, VTy };
12085     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12086     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12087     return Builder.CreateTrunc(Ops[0], Int16Ty);
12088   }
12089   case NEON::BI__builtin_neon_vminv_f16: {
12090     Int = Intrinsic::aarch64_neon_fminv;
12091     Ty = HalfTy;
12092     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12093     llvm::Type *Tys[2] = { Ty, VTy };
12094     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12095     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12096     return Builder.CreateTrunc(Ops[0], HalfTy);
12097   }
12098   case NEON::BI__builtin_neon_vminvq_f16: {
12099     Int = Intrinsic::aarch64_neon_fminv;
12100     Ty = HalfTy;
12101     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12102     llvm::Type *Tys[2] = { Ty, VTy };
12103     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12104     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12105     return Builder.CreateTrunc(Ops[0], HalfTy);
12106   }
12107   case NEON::BI__builtin_neon_vmaxnmv_f16: {
12108     Int = Intrinsic::aarch64_neon_fmaxnmv;
12109     Ty = HalfTy;
12110     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12111     llvm::Type *Tys[2] = { Ty, VTy };
12112     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12113     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12114     return Builder.CreateTrunc(Ops[0], HalfTy);
12115   }
12116   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
12117     Int = Intrinsic::aarch64_neon_fmaxnmv;
12118     Ty = HalfTy;
12119     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12120     llvm::Type *Tys[2] = { Ty, VTy };
12121     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12122     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12123     return Builder.CreateTrunc(Ops[0], HalfTy);
12124   }
12125   case NEON::BI__builtin_neon_vminnmv_f16: {
12126     Int = Intrinsic::aarch64_neon_fminnmv;
12127     Ty = HalfTy;
12128     VTy = llvm::FixedVectorType::get(HalfTy, 4);
12129     llvm::Type *Tys[2] = { Ty, VTy };
12130     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12131     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12132     return Builder.CreateTrunc(Ops[0], HalfTy);
12133   }
12134   case NEON::BI__builtin_neon_vminnmvq_f16: {
12135     Int = Intrinsic::aarch64_neon_fminnmv;
12136     Ty = HalfTy;
12137     VTy = llvm::FixedVectorType::get(HalfTy, 8);
12138     llvm::Type *Tys[2] = { Ty, VTy };
12139     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12140     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12141     return Builder.CreateTrunc(Ops[0], HalfTy);
12142   }
12143   case NEON::BI__builtin_neon_vmul_n_f64: {
12144     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12145     Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
12146     return Builder.CreateFMul(Ops[0], RHS);
12147   }
12148   case NEON::BI__builtin_neon_vaddlv_u8: {
12149     Int = Intrinsic::aarch64_neon_uaddlv;
12150     Ty = Int32Ty;
12151     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12152     llvm::Type *Tys[2] = { Ty, VTy };
12153     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12154     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12155     return Builder.CreateTrunc(Ops[0], Int16Ty);
12156   }
12157   case NEON::BI__builtin_neon_vaddlv_u16: {
12158     Int = Intrinsic::aarch64_neon_uaddlv;
12159     Ty = Int32Ty;
12160     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12161     llvm::Type *Tys[2] = { Ty, VTy };
12162     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12163     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12164   }
12165   case NEON::BI__builtin_neon_vaddlvq_u8: {
12166     Int = Intrinsic::aarch64_neon_uaddlv;
12167     Ty = Int32Ty;
12168     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12169     llvm::Type *Tys[2] = { Ty, VTy };
12170     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12171     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12172     return Builder.CreateTrunc(Ops[0], Int16Ty);
12173   }
12174   case NEON::BI__builtin_neon_vaddlvq_u16: {
12175     Int = Intrinsic::aarch64_neon_uaddlv;
12176     Ty = Int32Ty;
12177     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12178     llvm::Type *Tys[2] = { Ty, VTy };
12179     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12180     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12181   }
12182   case NEON::BI__builtin_neon_vaddlv_s8: {
12183     Int = Intrinsic::aarch64_neon_saddlv;
12184     Ty = Int32Ty;
12185     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12186     llvm::Type *Tys[2] = { Ty, VTy };
12187     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12188     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12189     return Builder.CreateTrunc(Ops[0], Int16Ty);
12190   }
12191   case NEON::BI__builtin_neon_vaddlv_s16: {
12192     Int = Intrinsic::aarch64_neon_saddlv;
12193     Ty = Int32Ty;
12194     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12195     llvm::Type *Tys[2] = { Ty, VTy };
12196     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12197     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12198   }
12199   case NEON::BI__builtin_neon_vaddlvq_s8: {
12200     Int = Intrinsic::aarch64_neon_saddlv;
12201     Ty = Int32Ty;
12202     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12203     llvm::Type *Tys[2] = { Ty, VTy };
12204     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12205     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12206     return Builder.CreateTrunc(Ops[0], Int16Ty);
12207   }
12208   case NEON::BI__builtin_neon_vaddlvq_s16: {
12209     Int = Intrinsic::aarch64_neon_saddlv;
12210     Ty = Int32Ty;
12211     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12212     llvm::Type *Tys[2] = { Ty, VTy };
12213     Ops.push_back(EmitScalarExpr(E->getArg(0)));
12214     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12215   }
12216   case NEON::BI__builtin_neon_vsri_n_v:
12217   case NEON::BI__builtin_neon_vsriq_n_v: {
12218     Int = Intrinsic::aarch64_neon_vsri;
12219     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12220     return EmitNeonCall(Intrin, Ops, "vsri_n");
12221   }
12222   case NEON::BI__builtin_neon_vsli_n_v:
12223   case NEON::BI__builtin_neon_vsliq_n_v: {
12224     Int = Intrinsic::aarch64_neon_vsli;
12225     llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12226     return EmitNeonCall(Intrin, Ops, "vsli_n");
12227   }
12228   case NEON::BI__builtin_neon_vsra_n_v:
12229   case NEON::BI__builtin_neon_vsraq_n_v:
12230     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12231     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
12232     return Builder.CreateAdd(Ops[0], Ops[1]);
12233   case NEON::BI__builtin_neon_vrsra_n_v:
12234   case NEON::BI__builtin_neon_vrsraq_n_v: {
12235     Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
12236     SmallVector<llvm::Value*,2> TmpOps;
12237     TmpOps.push_back(Ops[1]);
12238     TmpOps.push_back(Ops[2]);
12239     Function* F = CGM.getIntrinsic(Int, Ty);
12240     llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
12241     Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
12242     return Builder.CreateAdd(Ops[0], tmp);
12243   }
12244   case NEON::BI__builtin_neon_vld1_v:
12245   case NEON::BI__builtin_neon_vld1q_v: {
12246     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
12247     return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
12248   }
12249   case NEON::BI__builtin_neon_vst1_v:
12250   case NEON::BI__builtin_neon_vst1q_v:
12251     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
12252     Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
12253     return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12254   case NEON::BI__builtin_neon_vld1_lane_v:
12255   case NEON::BI__builtin_neon_vld1q_lane_v: {
12256     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12257     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
12258     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12259     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12260                                        PtrOp0.getAlignment());
12261     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
12262   }
12263   case NEON::BI__builtin_neon_vldap1_lane_s64:
12264   case NEON::BI__builtin_neon_vldap1q_lane_s64: {
12265     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12266     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
12267     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12268     llvm::LoadInst *LI = Builder.CreateAlignedLoad(
12269         VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
12270     LI->setAtomic(llvm::AtomicOrdering::Acquire);
12271     Ops[0] = LI;
12272     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
12273   }
12274   case NEON::BI__builtin_neon_vld1_dup_v:
12275   case NEON::BI__builtin_neon_vld1q_dup_v: {
12276     Value *V = PoisonValue::get(Ty);
12277     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
12278     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12279     Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12280                                        PtrOp0.getAlignment());
12281     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
12282     Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
12283     return EmitNeonSplat(Ops[0], CI);
12284   }
12285   case NEON::BI__builtin_neon_vst1_lane_v:
12286   case NEON::BI__builtin_neon_vst1q_lane_v:
12287     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12288     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12289     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
12290     return Builder.CreateAlignedStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty),
12291                                       PtrOp0.getAlignment());
12292   case NEON::BI__builtin_neon_vstl1_lane_s64:
12293   case NEON::BI__builtin_neon_vstl1q_lane_s64: {
12294     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12295     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12296     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
12297     llvm::StoreInst *SI = Builder.CreateAlignedStore(
12298         Ops[1], Builder.CreateBitCast(Ops[0], Ty), PtrOp0.getAlignment());
12299     SI->setAtomic(llvm::AtomicOrdering::Release);
12300     return SI;
12301   }
12302   case NEON::BI__builtin_neon_vld2_v:
12303   case NEON::BI__builtin_neon_vld2q_v: {
12304     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
12305     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12306     llvm::Type *Tys[2] = { VTy, PTy };
12307     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
12308     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
12309     Ops[0] = Builder.CreateBitCast(Ops[0],
12310                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12311     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12312   }
12313   case NEON::BI__builtin_neon_vld3_v:
12314   case NEON::BI__builtin_neon_vld3q_v: {
12315     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
12316     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12317     llvm::Type *Tys[2] = { VTy, PTy };
12318     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
12319     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
12320     Ops[0] = Builder.CreateBitCast(Ops[0],
12321                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12322     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12323   }
12324   case NEON::BI__builtin_neon_vld4_v:
12325   case NEON::BI__builtin_neon_vld4q_v: {
12326     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
12327     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12328     llvm::Type *Tys[2] = { VTy, PTy };
12329     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
12330     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
12331     Ops[0] = Builder.CreateBitCast(Ops[0],
12332                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12333     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12334   }
12335   case NEON::BI__builtin_neon_vld2_dup_v:
12336   case NEON::BI__builtin_neon_vld2q_dup_v: {
12337     llvm::Type *PTy =
12338       llvm::PointerType::getUnqual(VTy->getElementType());
12339     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12340     llvm::Type *Tys[2] = { VTy, PTy };
12341     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
12342     Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
12343     Ops[0] = Builder.CreateBitCast(Ops[0],
12344                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12345     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12346   }
12347   case NEON::BI__builtin_neon_vld3_dup_v:
12348   case NEON::BI__builtin_neon_vld3q_dup_v: {
12349     llvm::Type *PTy =
12350       llvm::PointerType::getUnqual(VTy->getElementType());
12351     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12352     llvm::Type *Tys[2] = { VTy, PTy };
12353     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
12354     Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
12355     Ops[0] = Builder.CreateBitCast(Ops[0],
12356                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12357     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12358   }
12359   case NEON::BI__builtin_neon_vld4_dup_v:
12360   case NEON::BI__builtin_neon_vld4q_dup_v: {
12361     llvm::Type *PTy =
12362       llvm::PointerType::getUnqual(VTy->getElementType());
12363     Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
12364     llvm::Type *Tys[2] = { VTy, PTy };
12365     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
12366     Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
12367     Ops[0] = Builder.CreateBitCast(Ops[0],
12368                 llvm::PointerType::getUnqual(Ops[1]->getType()));
12369     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12370   }
12371   case NEON::BI__builtin_neon_vld2_lane_v:
12372   case NEON::BI__builtin_neon_vld2q_lane_v: {
12373     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
12374     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
12375     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
12376     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12377     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12378     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
12379     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
12380     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
12381     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12382     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12383   }
12384   case NEON::BI__builtin_neon_vld3_lane_v:
12385   case NEON::BI__builtin_neon_vld3q_lane_v: {
12386     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
12387     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
12388     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
12389     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12390     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12391     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
12392     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
12393     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
12394     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
12395     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12396     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12397   }
12398   case NEON::BI__builtin_neon_vld4_lane_v:
12399   case NEON::BI__builtin_neon_vld4q_lane_v: {
12400     llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
12401     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
12402     std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
12403     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12404     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12405     Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
12406     Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
12407     Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
12408     Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
12409     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
12410     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12411     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12412   }
12413   case NEON::BI__builtin_neon_vst2_v:
12414   case NEON::BI__builtin_neon_vst2q_v: {
12415     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12416     llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
12417     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
12418                         Ops, "");
12419   }
12420   case NEON::BI__builtin_neon_vst2_lane_v:
12421   case NEON::BI__builtin_neon_vst2q_lane_v: {
12422     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12423     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
12424     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
12425     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
12426                         Ops, "");
12427   }
12428   case NEON::BI__builtin_neon_vst3_v:
12429   case NEON::BI__builtin_neon_vst3q_v: {
12430     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12431     llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
12432     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
12433                         Ops, "");
12434   }
12435   case NEON::BI__builtin_neon_vst3_lane_v:
12436   case NEON::BI__builtin_neon_vst3q_lane_v: {
12437     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12438     Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
12439     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
12440     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
12441                         Ops, "");
12442   }
12443   case NEON::BI__builtin_neon_vst4_v:
12444   case NEON::BI__builtin_neon_vst4q_v: {
12445     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12446     llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
12447     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
12448                         Ops, "");
12449   }
12450   case NEON::BI__builtin_neon_vst4_lane_v:
12451   case NEON::BI__builtin_neon_vst4q_lane_v: {
12452     std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
12453     Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
12454     llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
12455     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
12456                         Ops, "");
12457   }
12458   case NEON::BI__builtin_neon_vtrn_v:
12459   case NEON::BI__builtin_neon_vtrnq_v: {
12460     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
12461     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12462     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12463     Value *SV = nullptr;
12464 
12465     for (unsigned vi = 0; vi != 2; ++vi) {
12466       SmallVector<int, 16> Indices;
12467       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
12468         Indices.push_back(i+vi);
12469         Indices.push_back(i+e+vi);
12470       }
12471       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
12472       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
12473       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
12474     }
12475     return SV;
12476   }
12477   case NEON::BI__builtin_neon_vuzp_v:
12478   case NEON::BI__builtin_neon_vuzpq_v: {
12479     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
12480     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12481     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12482     Value *SV = nullptr;
12483 
12484     for (unsigned vi = 0; vi != 2; ++vi) {
12485       SmallVector<int, 16> Indices;
12486       for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
12487         Indices.push_back(2*i+vi);
12488 
12489       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
12490       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
12491       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
12492     }
12493     return SV;
12494   }
12495   case NEON::BI__builtin_neon_vzip_v:
12496   case NEON::BI__builtin_neon_vzipq_v: {
12497     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
12498     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12499     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12500     Value *SV = nullptr;
12501 
12502     for (unsigned vi = 0; vi != 2; ++vi) {
12503       SmallVector<int, 16> Indices;
12504       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
12505         Indices.push_back((i + vi*e) >> 1);
12506         Indices.push_back(((i + vi*e) >> 1)+e);
12507       }
12508       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
12509       SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
12510       SV = Builder.CreateDefaultAlignedStore(SV, Addr);
12511     }
12512     return SV;
12513   }
12514   case NEON::BI__builtin_neon_vqtbl1q_v: {
12515     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
12516                         Ops, "vtbl1");
12517   }
12518   case NEON::BI__builtin_neon_vqtbl2q_v: {
12519     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
12520                         Ops, "vtbl2");
12521   }
12522   case NEON::BI__builtin_neon_vqtbl3q_v: {
12523     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
12524                         Ops, "vtbl3");
12525   }
12526   case NEON::BI__builtin_neon_vqtbl4q_v: {
12527     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
12528                         Ops, "vtbl4");
12529   }
12530   case NEON::BI__builtin_neon_vqtbx1q_v: {
12531     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
12532                         Ops, "vtbx1");
12533   }
12534   case NEON::BI__builtin_neon_vqtbx2q_v: {
12535     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
12536                         Ops, "vtbx2");
12537   }
12538   case NEON::BI__builtin_neon_vqtbx3q_v: {
12539     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
12540                         Ops, "vtbx3");
12541   }
12542   case NEON::BI__builtin_neon_vqtbx4q_v: {
12543     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
12544                         Ops, "vtbx4");
12545   }
12546   case NEON::BI__builtin_neon_vsqadd_v:
12547   case NEON::BI__builtin_neon_vsqaddq_v: {
12548     Int = Intrinsic::aarch64_neon_usqadd;
12549     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
12550   }
12551   case NEON::BI__builtin_neon_vuqadd_v:
12552   case NEON::BI__builtin_neon_vuqaddq_v: {
12553     Int = Intrinsic::aarch64_neon_suqadd;
12554     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
12555   }
12556   }
12557 }
12558 
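// Emit the BPF builtins that lower to the llvm.bpf.preserve.* and
// llvm.bpf.btf.type.id intrinsics, carrying field, type and enumerator
// information derived from debuginfo for the BPF backend.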
12559 Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
12560                                            const CallExpr *E) {
12561   assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
12562           BuiltinID == BPF::BI__builtin_btf_type_id ||
12563           BuiltinID == BPF::BI__builtin_preserve_type_info ||
12564           BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
12565          "unexpected BPF builtin");
12566 
12567   // A sequence number, injected into IR builtin function calls, to
12568   // prevent CSE of calls that are otherwise identical except for
12569   // their debuginfo metadata.
12570   static uint32_t BuiltinSeqNum;
12571 
12572   switch (BuiltinID) {
12573   default:
12574     llvm_unreachable("Unexpected BPF builtin");
12575   case BPF::BI__builtin_preserve_field_info: {
12576     const Expr *Arg = E->getArg(0);
12577     bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
12578 
12579     if (!getDebugInfo()) {
12580       CGM.Error(E->getExprLoc(),
12581                 "using __builtin_preserve_field_info() without -g");
12582       return IsBitField ? EmitLValue(Arg).getBitFieldPointer()
12583                         : EmitLValue(Arg).getPointer(*this);
12584     }
12585 
12586     // Enable underlying preserve_*_access_index() generation.
12587     bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
12588     IsInPreservedAIRegion = true;
12589     Value *FieldAddr = IsBitField ? EmitLValue(Arg).getBitFieldPointer()
12590                                   : EmitLValue(Arg).getPointer(*this);
12591     IsInPreservedAIRegion = OldIsInPreservedAIRegion;
12592 
12593     ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
12594     Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());
12595 
12596     // Build the IR for the preserve_field_info intrinsic.
12597     llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration(
12598         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info,
12599         {FieldAddr->getType()});
12600     return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
12601   }
12602   case BPF::BI__builtin_btf_type_id:
12603   case BPF::BI__builtin_preserve_type_info: {
12604     if (!getDebugInfo()) {
12605       CGM.Error(E->getExprLoc(), "using builtin function without -g");
12606       return nullptr;
12607     }
12608 
12609     const Expr *Arg0 = E->getArg(0);
12610     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
12611         Arg0->getType(), Arg0->getExprLoc());
12612 
12613     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
12614     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
12615     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
12616 
12617     llvm::Function *FnDecl;
12618     if (BuiltinID == BPF::BI__builtin_btf_type_id)
12619       FnDecl = llvm::Intrinsic::getDeclaration(
12620           &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {});
12621     else
12622       FnDecl = llvm::Intrinsic::getDeclaration(
12623           &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {});
12624     CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
12625     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
12626     return Fn;
12627   }
12628   case BPF::BI__builtin_preserve_enum_value: {
12629     if (!getDebugInfo()) {
12630       CGM.Error(E->getExprLoc(), "using builtin function without -g");
12631       return nullptr;
12632     }
12633 
12634     const Expr *Arg0 = E->getArg(0);
12635     llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
12636         Arg0->getType(), Arg0->getExprLoc());
12637 
12638     // Find the enumerator referenced by the argument.
12639     const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
12640     const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
12641     const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
12642     const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());
12643 
12644     auto &InitVal = Enumerator->getInitVal();
12645     std::string InitValStr;
12646     if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
12647       InitValStr = std::to_string(InitVal.getSExtValue());
12648     else
12649       InitValStr = std::to_string(InitVal.getZExtValue());
12650     std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
12651     Value *EnumStrVal = Builder.CreateGlobalStringPtr(EnumStr);
12652 
12653     ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
12654     Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
12655     Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
12656 
12657     llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration(
12658         &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {});
12659     CallInst *Fn =
12660         Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
12661     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
12662     return Fn;
12663   }
12664   }
12665 }
12666 
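// Build a vector value from a power-of-two number of scalar operands:
// fold to a ConstantVector when every operand is a Constant, otherwise
// emit a chain of insertelement instructions starting from poison.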
12667 llvm::Value *CodeGenFunction::
12668 BuildVector(ArrayRef<llvm::Value*> Ops) {
12669   assert((Ops.size() & (Ops.size() - 1)) == 0 &&
12670          "Not a power-of-two sized vector!");
12671   bool AllConstants = true;
12672   for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
12673     AllConstants &= isa<Constant>(Ops[i]);
12674 
12675   // If this is a constant vector, create a ConstantVector.
12676   if (AllConstants) {
12677     SmallVector<llvm::Constant*, 16> CstOps;
12678     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
12679       CstOps.push_back(cast<Constant>(Ops[i]));
12680     return llvm::ConstantVector::get(CstOps);
12681   }
12682 
12683   // Otherwise, insertelement the values to build the vector.
12684   Value *Result = llvm::PoisonValue::get(
12685       llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
12686 
12687   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
12688     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));
12689 
12690   return Result;
12691 }
12692 
12693 // Convert the mask from an integer type to a vector of i1.
12694 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
12695                               unsigned NumElts) {
12696 
12697   auto *MaskTy = llvm::FixedVectorType::get(
12698       CGF.Builder.getInt1Ty(),
12699       cast<IntegerType>(Mask->getType())->getBitWidth());
12700   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
12701 
12702   // If we have fewer than 8 elements, then the starting mask was an i8 and
12703   // we need to extract down to the right number of elements.
12704   if (NumElts < 8) {
12705     int Indices[4];
12706     for (unsigned i = 0; i != NumElts; ++i)
12707       Indices[i] = i;
12708     MaskVec = CGF.Builder.CreateShuffleVector(
12709         MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract");
12710   }
12711   return MaskVec;
12712 }
12713 
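// Emit a masked vector store: Ops[0] is the pointer, Ops[1] the value to
// store, and Ops[2] the integer mask, converted to a vector of i1.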
12714 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12715                                  Align Alignment) {
12716   // Cast the pointer to the right type.
12717   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12718                                llvm::PointerType::getUnqual(Ops[1]->getType()));
12719 
12720   Value *MaskVec = getMaskVecValue(
12721       CGF, Ops[2],
12722       cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
12723 
12724   return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
12725 }
12726 
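// Emit a masked vector load: Ops[0] is the pointer, Ops[1] the passthru
// value for disabled lanes, and Ops[2] the integer mask.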
12727 static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12728                                 Align Alignment) {
12729   // Cast the pointer to the right type.
12730   llvm::Type *Ty = Ops[1]->getType();
12731   Value *Ptr =
12732       CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
12733 
12734   Value *MaskVec = getMaskVecValue(
12735       CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
12736 
12737   return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
12738 }
12739 
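// Emit an expanding load via the masked_expandload intrinsic: consecutive
// elements are read from memory and expanded into the enabled lanes of the
// result, with disabled lanes taken from the passthru value in Ops[1].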
12740 static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
12741                                 ArrayRef<Value *> Ops) {
12742   auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
12743   llvm::Type *PtrTy = ResultTy->getElementType();
12744 
12745   // Cast the pointer to the element type.
12746   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12747                                          llvm::PointerType::getUnqual(PtrTy));
12748 
12749   Value *MaskVec = getMaskVecValue(
12750       CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
12751 
12752   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
12753                                            ResultTy);
12754   return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
12755 }
12756 
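// Emit a register-to-register compress or expand through the corresponding
// AVX-512 mask intrinsic, with Ops[1] as the passthru and Ops[2] as the mask.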
12757 static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
12758                                     ArrayRef<Value *> Ops,
12759                                     bool IsCompress) {
12760   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
12761 
12762   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
12763 
12764   Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
12765                                  : Intrinsic::x86_avx512_mask_expand;
12766   llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
12767   return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
12768 }
12769 
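// Emit a compressing store via the masked_compressstore intrinsic: the
// enabled lanes of Ops[1] are stored contiguously through the pointer Ops[0].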
12770 static Value *EmitX86CompressStore(CodeGenFunction &CGF,
12771                                    ArrayRef<Value *> Ops) {
12772   auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
12773   llvm::Type *PtrTy = ResultTy->getElementType();
12774 
12775   // Cast the pointer to the element type.
12776   Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
12777                                          llvm::PointerType::getUnqual(PtrTy));
12778 
12779   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
12780 
12781   llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
12782                                            ResultTy);
12783   return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
12784 }
12785 
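// Emit a bitwise operation on two integer masks: both are converted to
// vectors of i1, combined with Opc (optionally inverting the LHS first),
// and the result is bitcast back to the original integer type.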
12786 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
12787                               ArrayRef<Value *> Ops,
12788                               bool InvertLHS = false) {
12789   unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
12790   Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
12791   Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
12792 
12793   if (InvertLHS)
12794     LHS = CGF.Builder.CreateNot(LHS);
12795 
12796   return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
12797                                    Ops[0]->getType());
12798 }
12799 
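// Emit a funnel shift (fshl/fshr); a scalar shift amount is splatted to the
// vector type first.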
12800 static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
12801                                  Value *Amt, bool IsRight) {
12802   llvm::Type *Ty = Op0->getType();
12803 
12804   // The amount may be a scalar immediate; if so, create a splat vector.
12805   // Funnel shift amounts are taken modulo the bit width and the types are
12806   // all power-of-2 sized, so we only care about the lowest log2 bits anyway.
12807   if (Amt->getType() != Ty) {
12808     unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
12809     Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
12810     Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
12811   }
12812 
12813   unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
12814   Function *F = CGF.CGM.getIntrinsic(IID, Ty);
12815   return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
12816 }
12817 
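// Emit an XOP vpcom/vpcomu comparison: the immediate in Ops[2] selects the
// predicate, with 6 and 7 folding directly to all-false/all-true results.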
12818 static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
12819                            bool IsSigned) {
12820   Value *Op0 = Ops[0];
12821   Value *Op1 = Ops[1];
12822   llvm::Type *Ty = Op0->getType();
12823   uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
12824 
12825   CmpInst::Predicate Pred;
12826   switch (Imm) {
12827   case 0x0:
12828     Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
12829     break;
12830   case 0x1:
12831     Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
12832     break;
12833   case 0x2:
12834     Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
12835     break;
12836   case 0x3:
12837     Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
12838     break;
12839   case 0x4:
12840     Pred = ICmpInst::ICMP_EQ;
12841     break;
12842   case 0x5:
12843     Pred = ICmpInst::ICMP_NE;
12844     break;
12845   case 0x6:
12846     return llvm::Constant::getNullValue(Ty); // FALSE
12847   case 0x7:
12848     return llvm::Constant::getAllOnesValue(Ty); // TRUE
12849   default:
12850     llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
12851   }
12852 
12853   Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
12854   Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
12855   return Res;
12856 }
12857 
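// Emit a per-lane select between Op0 and Op1 controlled by an integer mask,
// short-circuiting to Op0 when the mask is a constant all-ones value.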
12858 static Value *EmitX86Select(CodeGenFunction &CGF,
12859                             Value *Mask, Value *Op0, Value *Op1) {
12860 
12861   // If the mask is all ones, just return the first argument.
12862   if (const auto *C = dyn_cast<Constant>(Mask))
12863     if (C->isAllOnesValue())
12864       return Op0;
12865 
12866   Mask = getMaskVecValue(
12867       CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
12868 
12869   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
12870 }
12871 
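// Emit a select for a single scalar lane: only bit 0 of the mask is used.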
12872 static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
12873                                   Value *Mask, Value *Op0, Value *Op1) {
12874   // If the mask is all ones, just return the first argument.
12875   if (const auto *C = dyn_cast<Constant>(Mask))
12876     if (C->isAllOnesValue())
12877       return Op0;
12878 
12879   auto *MaskTy = llvm::FixedVectorType::get(
12880       CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
12881   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
12882   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
12883   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
12884 }
12885 
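// Combine a vector-of-i1 compare result with an optional incoming mask,
// zero-pad it to at least 8 elements, and bitcast it to an integer mask.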
12886 static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
12887                                          unsigned NumElts, Value *MaskIn) {
12888   if (MaskIn) {
12889     const auto *C = dyn_cast<Constant>(MaskIn);
12890     if (!C || !C->isAllOnesValue())
12891       Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
12892   }
12893 
12894   if (NumElts < 8) {
12895     int Indices[8];
12896     for (unsigned i = 0; i != NumElts; ++i)
12897       Indices[i] = i;
12898     for (unsigned i = NumElts; i != 8; ++i)
12899       Indices[i] = i % NumElts + NumElts;
12900     Cmp = CGF.Builder.CreateShuffleVector(
12901         Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
12902   }
12903 
12904   return CGF.Builder.CreateBitCast(Cmp,
12905                                    IntegerType::get(CGF.getLLVMContext(),
12906                                                     std::max(NumElts, 8U)));
12907 }
12908 
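// Emit an AVX-512 style masked integer compare: CC encodes the predicate
// (3 and 7 fold to constant all-false/all-true results), and a fourth
// operand, when present, supplies the write mask for the result.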
12909 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
12910                                    bool Signed, ArrayRef<Value *> Ops) {
12911   assert((Ops.size() == 2 || Ops.size() == 4) &&
12912          "Unexpected number of arguments");
12913   unsigned NumElts =
12914       cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
12915   Value *Cmp;
12916 
12917   if (CC == 3) {
12918     Cmp = Constant::getNullValue(
12919         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
12920   } else if (CC == 7) {
12921     Cmp = Constant::getAllOnesValue(
12922         llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
12923   } else {
12924     ICmpInst::Predicate Pred;
12925     switch (CC) {
12926     default: llvm_unreachable("Unknown condition code");
12927     case 0: Pred = ICmpInst::ICMP_EQ;  break;
12928     case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
12929     case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
12930     case 4: Pred = ICmpInst::ICMP_NE;  break;
12931     case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
12932     case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
12933     }
12934     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
12935   }
12936 
12937   Value *MaskIn = nullptr;
12938   if (Ops.size() == 4)
12939     MaskIn = Ops[3];
12940 
12941   return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
12942 }
12943 
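// Convert a vector to a mask by testing the sign bit of each element
// (a signed compare against zero).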
12944 static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
12945   Value *Zero = Constant::getNullValue(In->getType());
12946   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
12947 }
12948 
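// Emit a masked integer-to-floating-point conversion. A rounding operand
// other than _MM_FROUND_CUR_DIRECTION (4) selects the target rounding
// intrinsic; otherwise a plain sitofp/uitofp is emitted. The result is then
// blended with the passthru value Ops[1] under the mask Ops[2].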
12949 static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
12950                                     ArrayRef<Value *> Ops, bool IsSigned) {
12951   unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
12952   llvm::Type *Ty = Ops[1]->getType();
12953 
12954   Value *Res;
12955   if (Rnd != 4) {
12956     Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
12957                                  : Intrinsic::x86_avx512_uitofp_round;
12958     Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
12959     Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
12960   } else {
12961     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
12962     Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
12963                    : CGF.Builder.CreateUIToFP(Ops[0], Ty);
12964   }
12965 
12966   return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
12967 }
12968 
12969 // Lowers X86 FMA intrinsics to IR.
12970 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
12971                              ArrayRef<Value *> Ops, unsigned BuiltinID,
12972                              bool IsAddSub) {
12973 
12974   bool Subtract = false;
12975   Intrinsic::ID IID = Intrinsic::not_intrinsic;
12976   switch (BuiltinID) {
12977   default: break;
12978   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
12979     Subtract = true;
12980     [[fallthrough]];
12981   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
12982   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
12983   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
12984     IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
12985     break;
12986   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
12987     Subtract = true;
12988     [[fallthrough]];
12989   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
12990   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
12991   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
12992     IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
12993     break;
12994   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
12995     Subtract = true;
12996     [[fallthrough]];
12997   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
12998   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
12999   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13000     IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
13001   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13002     Subtract = true;
13003     [[fallthrough]];
13004   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13005   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13006   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13007     IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
13008   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13009     Subtract = true;
13010     [[fallthrough]];
13011   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13012   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13013   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13014     IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
13015     break;
13016   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13017     Subtract = true;
13018     [[fallthrough]];
13019   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13020   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13021   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13022     IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
13023     break;
13024   }
13025 
13026   Value *A = Ops[0];
13027   Value *B = Ops[1];
13028   Value *C = Ops[2];
13029 
13030   if (Subtract)
13031     C = CGF.Builder.CreateFNeg(C);
13032 
13033   Value *Res;
13034 
  // Use the target-specific intrinsic when a non-default rounding mode is
  // requested or for the addsub forms (which have no generic IR equivalent);
  // otherwise emit a plain llvm.fma. _MM_FROUND_CUR_DIRECTION (4) means "no
  // explicit rounding".
13036   if (IID != Intrinsic::not_intrinsic &&
13037       (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
13038        IsAddSub)) {
13039     Function *Intr = CGF.CGM.getIntrinsic(IID);
13040     Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
13041   } else {
13042     llvm::Type *Ty = A->getType();
13043     Function *FMA;
13044     if (CGF.Builder.getIsFPConstrained()) {
13045       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13046       FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
13047       Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
13048     } else {
13049       FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
13050       Res = CGF.Builder.CreateCall(FMA, {A, B, C});
13051     }
13052   }
13053 
13054   // Handle any required masking.
13055   Value *MaskFalseVal = nullptr;
13056   switch (BuiltinID) {
13057   case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
13058   case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
13059   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13060   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
13061   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13062   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13063     MaskFalseVal = Ops[0];
13064     break;
13065   case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
13066   case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
13067   case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13068   case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13069   case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13070   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13071     MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
13072     break;
13073   case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
13074   case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
13075   case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
13076   case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13077   case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13078   case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13079   case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13080   case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13081   case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13082   case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13083   case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13084   case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13085     MaskFalseVal = Ops[2];
13086     break;
13087   }
13088 
13089   if (MaskFalseVal)
13090     return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
13091 
13092   return Res;
13093 }
13094 
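// Emit a scalar FMA on element 0 of the operand vectors. The low elements are
// extracted, the multiply-add is formed (via the rounding intrinsic, a
// constrained fma, or plain llvm.fma), optional masking selects between the
// result and a passthru value, and the result is inserted back into element 0
// of Upper.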
13095 static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
13096                                 MutableArrayRef<Value *> Ops, Value *Upper,
13097                                 bool ZeroMask = false, unsigned PTIdx = 0,
13098                                 bool NegAcc = false) {
13099   unsigned Rnd = 4;
13100   if (Ops.size() > 4)
13101     Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
13102 
13103   if (NegAcc)
13104     Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
13105 
13106   Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
13107   Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
13108   Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
13109   Value *Res;
13110   if (Rnd != 4) {
13111     Intrinsic::ID IID;
13112 
13113     switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
13114     case 16:
13115       IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
13116       break;
13117     case 32:
13118       IID = Intrinsic::x86_avx512_vfmadd_f32;
13119       break;
13120     case 64:
13121       IID = Intrinsic::x86_avx512_vfmadd_f64;
13122       break;
13123     default:
13124       llvm_unreachable("Unexpected size");
13125     }
13126     Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13127                                  {Ops[0], Ops[1], Ops[2], Ops[4]});
13128   } else if (CGF.Builder.getIsFPConstrained()) {
13129     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13130     Function *FMA = CGF.CGM.getIntrinsic(
13131         Intrinsic::experimental_constrained_fma, Ops[0]->getType());
13132     Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
13133   } else {
13134     Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
13135     Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
13136   }
13137   // If we have more than 3 arguments, we need to do masking.
13138   if (Ops.size() > 3) {
13139     Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
13140                                : Ops[PTIdx];
13141 
    // If we negated the accumulator and it's the PassThru value, we need to
    // bypass the negate. Conveniently, Upper should be the same thing in this
    // case.
13145     if (NegAcc && PTIdx == 2)
13146       PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
13147 
13148     Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
13149   }
13150   return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
13151 }
13152 
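// Emit pmuldq/pmuludq: reinterpret the vXi32 operands as vXi64, sign- or
// zero-extend the low 32 bits of each 64-bit element in place, and multiply.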
13153 static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
13154                            ArrayRef<Value *> Ops) {
13155   llvm::Type *Ty = Ops[0]->getType();
13156   // Arguments have a vXi32 type so cast to vXi64.
13157   Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
13158                                   Ty->getPrimitiveSizeInBits() / 64);
13159   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
13160   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
13161 
13162   if (IsSigned) {
13163     // Shift left then arithmetic shift right.
13164     Constant *ShiftAmt = ConstantInt::get(Ty, 32);
13165     LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
13166     LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
13167     RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
13168     RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
13169   } else {
13170     // Clear the upper bits.
13171     Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
13172     LHS = CGF.Builder.CreateAnd(LHS, Mask);
13173     RHS = CGF.Builder.CreateAnd(RHS, Mask);
13174   }
13175 
13176   return CGF.Builder.CreateMul(LHS, RHS);
13177 }
13178 
13179 // Emit a masked pternlog intrinsic. This only exists because the header has to
13180 // use a macro and we aren't able to pass the input argument to a pternlog
13181 // builtin and a select builtin without evaluating it twice.
13182 static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
13183                              ArrayRef<Value *> Ops) {
13184   llvm::Type *Ty = Ops[0]->getType();
13185 
13186   unsigned VecWidth = Ty->getPrimitiveSizeInBits();
13187   unsigned EltWidth = Ty->getScalarSizeInBits();
13188   Intrinsic::ID IID;
13189   if (VecWidth == 128 && EltWidth == 32)
13190     IID = Intrinsic::x86_avx512_pternlog_d_128;
13191   else if (VecWidth == 256 && EltWidth == 32)
13192     IID = Intrinsic::x86_avx512_pternlog_d_256;
13193   else if (VecWidth == 512 && EltWidth == 32)
13194     IID = Intrinsic::x86_avx512_pternlog_d_512;
13195   else if (VecWidth == 128 && EltWidth == 64)
13196     IID = Intrinsic::x86_avx512_pternlog_q_128;
13197   else if (VecWidth == 256 && EltWidth == 64)
13198     IID = Intrinsic::x86_avx512_pternlog_q_256;
13199   else if (VecWidth == 512 && EltWidth == 64)
13200     IID = Intrinsic::x86_avx512_pternlog_q_512;
13201   else
13202     llvm_unreachable("Unexpected intrinsic");
13203 
13204   Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13205                                           Ops.drop_back());
13206   Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
13207   return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
13208 }
13209 
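// Sign-extend an integer mask into a vector whose elements are all-zeros or
// all-ones (vpmovm2*).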
13210 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
13211                               llvm::Type *DstTy) {
13212   unsigned NumberOfElements =
13213       cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13214   Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
13215   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
13216 }
13217 
13218 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
13219   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
13220   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
13221   return EmitX86CpuIs(CPUStr);
13222 }
13223 
// Convert F16 halves to floats.
13225 static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
13226                                        ArrayRef<Value *> Ops,
13227                                        llvm::Type *DstTy) {
13228   assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
13229          "Unknown cvtph2ps intrinsic");
13230 
  // If the SAE intrinsic doesn't use default rounding, we can't lower it to a
  // plain fpext; emit the masked conversion intrinsic instead.
13232   if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
13233     Function *F =
13234         CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
13235     return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
13236   }
13237 
13238   unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13239   Value *Src = Ops[0];
13240 
13241   // Extract the subvector.
13242   if (NumDstElts !=
13243       cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
13244     assert(NumDstElts == 4 && "Unexpected vector size");
13245     Src = CGF.Builder.CreateShuffleVector(Src, ArrayRef<int>{0, 1, 2, 3});
13246   }
13247 
13248   // Bitcast from vXi16 to vXf16.
13249   auto *HalfTy = llvm::FixedVectorType::get(
13250       llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
13251   Src = CGF.Builder.CreateBitCast(Src, HalfTy);
13252 
13253   // Perform the fp-extension.
13254   Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
13255 
13256   if (Ops.size() >= 3)
13257     Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
13258   return Res;
13259 }
13260 
13261 Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
13262 
13263   llvm::Type *Int32Ty = Builder.getInt32Ty();
13264 
13265   // Matching the struct layout from the compiler-rt/libgcc structure that is
13266   // filled in:
13267   // unsigned int __cpu_vendor;
13268   // unsigned int __cpu_type;
13269   // unsigned int __cpu_subtype;
13270   // unsigned int __cpu_features[1];
13271   llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13272                                           llvm::ArrayType::get(Int32Ty, 1));
13273 
13274   // Grab the global __cpu_model.
13275   llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13276   cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13277 
13278   // Calculate the index needed to access the correct field based on the
13279   // range. Also adjust the expected value.
13280   unsigned Index;
13281   unsigned Value;
13282   std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
13283 #define X86_VENDOR(ENUM, STRING)                                               \
13284   .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
13285 #define X86_CPU_TYPE_ALIAS(ENUM, ALIAS)                                        \
13286   .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13287 #define X86_CPU_TYPE(ENUM, STR)                                                \
13288   .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13289 #define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS)                                     \
13290   .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13291 #define X86_CPU_SUBTYPE(ENUM, STR)                                             \
13292   .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13293 #include "llvm/TargetParser/X86TargetParser.def"
13294                                .Default({0, 0});
13295   assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
13296 
13297   // Grab the appropriate field from __cpu_model.
13298   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
13299                          ConstantInt::get(Int32Ty, Index)};
13300   llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
13301   CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
13302                                        CharUnits::fromQuantity(4));
13303 
13304   // Check the value of the field against the requested value.
  return Builder.CreateICmpEQ(CpuValue,
                              llvm::ConstantInt::get(Int32Ty, Value));
13307 }
13308 
13309 Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
13310   const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
13311   StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
13312   return EmitX86CpuSupports(FeatureStr);
13313 }
13314 
13315 Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
13316   return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
13317 }
13318 
13319 llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
13320   uint32_t Features1 = Lo_32(FeaturesMask);
13321   uint32_t Features2 = Hi_32(FeaturesMask);
13322 
13323   Value *Result = Builder.getTrue();
13324 
13325   if (Features1 != 0) {
13326     // Matching the struct layout from the compiler-rt/libgcc structure that is
13327     // filled in:
13328     // unsigned int __cpu_vendor;
13329     // unsigned int __cpu_type;
13330     // unsigned int __cpu_subtype;
13331     // unsigned int __cpu_features[1];
13332     llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13333                                             llvm::ArrayType::get(Int32Ty, 1));
13334 
13335     // Grab the global __cpu_model.
13336     llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13337     cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13338 
    // Grab the first (0th) element of the __cpu_features field from the
    // __cpu_model global of type STy.
13341     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
13342                      Builder.getInt32(0)};
13343     Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
13344     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
13345                                                 CharUnits::fromQuantity(4));
13346 
13347     // Check the value of the bit corresponding to the feature requested.
13348     Value *Mask = Builder.getInt32(Features1);
13349     Value *Bitset = Builder.CreateAnd(Features, Mask);
13350     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
13351     Result = Builder.CreateAnd(Result, Cmp);
13352   }
13353 
13354   if (Features2 != 0) {
13355     llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
13356                                                              "__cpu_features2");
13357     cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
13358 
13359     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures2,
13360                                                 CharUnits::fromQuantity(4));
13361 
13362     // Check the value of the bit corresponding to the feature requested.
13363     Value *Mask = Builder.getInt32(Features2);
13364     Value *Bitset = Builder.CreateAnd(Features, Mask);
13365     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
13366     Result = Builder.CreateAnd(Result, Cmp);
13367   }
13368 
13369   return Result;
13370 }
13371 
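// Emit a call to the runtime's init_cpu_features_resolver so that the
// __aarch64_cpu_features bits read by EmitAArch64CpuSupports are initialized.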
13372 Value *CodeGenFunction::EmitAArch64CpuInit() {
13373   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
13374   llvm::FunctionCallee Func =
13375       CGM.CreateRuntimeFunction(FTy, "init_cpu_features_resolver");
13376   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
13377   cast<llvm::GlobalValue>(Func.getCallee())
13378       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
13379   return Builder.CreateCall(Func);
13380 }
13381 
13382 Value *CodeGenFunction::EmitX86CpuInit() {
13383   llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
13384                                                     /*Variadic*/ false);
13385   llvm::FunctionCallee Func =
13386       CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
13387   cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
13388   cast<llvm::GlobalValue>(Func.getCallee())
13389       ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
13390   return Builder.CreateCall(Func);
13391 }
13392 
13393 llvm::Value *
13394 CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
13395   uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
13396   Value *Result = Builder.getTrue();
13397   if (FeaturesMask != 0) {
    // Get the feature bits from the structure defined in the runtime library:
13399     // struct {
13400     //   unsigned long long features;
13401     // } __aarch64_cpu_features;
13402     llvm::Type *STy = llvm::StructType::get(Int64Ty);
13403     llvm::Constant *AArch64CPUFeatures =
13404         CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
13405     cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
13406     llvm::Value *CpuFeatures = Builder.CreateGEP(
13407         STy, AArch64CPUFeatures,
13408         {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
13409     Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
13410                                                 CharUnits::fromQuantity(8));
13411     Value *Mask = Builder.getInt64(FeaturesMask);
13412     Value *Bitset = Builder.CreateAnd(Features, Mask);
13413     Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
13414     Result = Builder.CreateAnd(Result, Cmp);
13415   }
13416   return Result;
13417 }
13418 
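// Emit LLVM IR for an X86 builtin call. The cpu_is/cpu_supports/cpu_init
// builtins and MSVC-compatible intrinsics are dispatched before the arguments
// are evaluated; everything else evaluates its operands (constant-folding any
// required immediate arguments) and then switches on the builtin ID.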
13419 Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
13420                                            const CallExpr *E) {
13421   if (BuiltinID == X86::BI__builtin_cpu_is)
13422     return EmitX86CpuIs(E);
13423   if (BuiltinID == X86::BI__builtin_cpu_supports)
13424     return EmitX86CpuSupports(E);
13425   if (BuiltinID == X86::BI__builtin_cpu_init)
13426     return EmitX86CpuInit();
13427 
13428   // Handle MSVC intrinsics before argument evaluation to prevent double
13429   // evaluation.
13430   if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
13431     return EmitMSVCBuiltinExpr(*MsvcIntId, E);
13432 
13433   SmallVector<Value*, 4> Ops;
13434   bool IsMaskFCmp = false;
13435   bool IsConjFMA = false;
13436 
13437   // Find out if any arguments are required to be integer constant expressions.
13438   unsigned ICEArguments = 0;
13439   ASTContext::GetBuiltinTypeError Error;
13440   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
13441   assert(Error == ASTContext::GE_None && "Should not codegen an error");
13442 
13443   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
13444     // If this is a normal argument, just emit it as a scalar.
13445     if ((ICEArguments & (1 << i)) == 0) {
13446       Ops.push_back(EmitScalarExpr(E->getArg(i)));
13447       continue;
13448     }
13449 
13450     // If this is required to be a constant, constant fold it so that we know
13451     // that the generated intrinsic gets a ConstantInt.
13452     Ops.push_back(llvm::ConstantInt::get(
13453         getLLVMContext(), *E->getArg(i)->getIntegerConstantExpr(getContext())));
13454   }
13455 
  // These exist so that the builtin that takes an immediate can be bounds
  // checked by clang to avoid passing bad immediates to the backend. Since
  // AVX has a larger immediate than SSE, we would need separate builtins to
  // do the different bounds checking. Rather than create a clang-specific,
  // SSE-only builtin, this implements eight separate builtins to match the
  // gcc implementation.
13462   auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
13463     Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
13464     llvm::Function *F = CGM.getIntrinsic(ID);
13465     return Builder.CreateCall(F, Ops);
13466   };
13467 
13468   // For the vector forms of FP comparisons, translate the builtins directly to
13469   // IR.
13470   // TODO: The builtins could be removed if the SSE header files used vector
13471   // extension comparisons directly (vector ordered/unordered may need
13472   // additional support via __builtin_isnan()).
13473   auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
13474                                          bool IsSignaling) {
13475     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
13476     Value *Cmp;
13477     if (IsSignaling)
13478       Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
13479     else
13480       Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
13481     llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
13482     llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
13483     Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
13484     return Builder.CreateBitCast(Sext, FPVecTy);
13485   };
13486 
13487   switch (BuiltinID) {
13488   default: return nullptr;
13489   case X86::BI_mm_prefetch: {
13490     Value *Address = Ops[0];
13491     ConstantInt *C = cast<ConstantInt>(Ops[1]);
13492     Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
13493     Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
13494     Value *Data = ConstantInt::get(Int32Ty, 1);
13495     Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
13496     return Builder.CreateCall(F, {Address, RW, Locality, Data});
13497   }
13498   case X86::BI_mm_clflush: {
13499     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
13500                               Ops[0]);
13501   }
13502   case X86::BI_mm_lfence: {
13503     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
13504   }
13505   case X86::BI_mm_mfence: {
13506     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
13507   }
13508   case X86::BI_mm_sfence: {
13509     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
13510   }
13511   case X86::BI_mm_pause: {
13512     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
13513   }
13514   case X86::BI__rdtsc: {
13515     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
13516   }
13517   case X86::BI__builtin_ia32_rdtscp: {
13518     Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
13519     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
13520                                       Ops[0]);
13521     return Builder.CreateExtractValue(Call, 0);
13522   }
13523   case X86::BI__builtin_ia32_lzcnt_u16:
13524   case X86::BI__builtin_ia32_lzcnt_u32:
13525   case X86::BI__builtin_ia32_lzcnt_u64: {
13526     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
13527     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
13528   }
13529   case X86::BI__builtin_ia32_tzcnt_u16:
13530   case X86::BI__builtin_ia32_tzcnt_u32:
13531   case X86::BI__builtin_ia32_tzcnt_u64: {
13532     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
13533     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
13534   }
13535   case X86::BI__builtin_ia32_undef128:
13536   case X86::BI__builtin_ia32_undef256:
13537   case X86::BI__builtin_ia32_undef512:
13538     // The x86 definition of "undef" is not the same as the LLVM definition
13539     // (PR32176). We leave optimizing away an unnecessary zero constant to the
13540     // IR optimizer and backend.
13541     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
13542     // value, we should use that here instead of a zero.
13543     return llvm::Constant::getNullValue(ConvertType(E->getType()));
13544   case X86::BI__builtin_ia32_vec_init_v8qi:
13545   case X86::BI__builtin_ia32_vec_init_v4hi:
13546   case X86::BI__builtin_ia32_vec_init_v2si:
13547     return Builder.CreateBitCast(BuildVector(Ops),
13548                                  llvm::Type::getX86_MMXTy(getLLVMContext()));
13549   case X86::BI__builtin_ia32_vec_ext_v2si:
13550   case X86::BI__builtin_ia32_vec_ext_v16qi:
13551   case X86::BI__builtin_ia32_vec_ext_v8hi:
13552   case X86::BI__builtin_ia32_vec_ext_v4si:
13553   case X86::BI__builtin_ia32_vec_ext_v4sf:
13554   case X86::BI__builtin_ia32_vec_ext_v2di:
13555   case X86::BI__builtin_ia32_vec_ext_v32qi:
13556   case X86::BI__builtin_ia32_vec_ext_v16hi:
13557   case X86::BI__builtin_ia32_vec_ext_v8si:
13558   case X86::BI__builtin_ia32_vec_ext_v4di: {
13559     unsigned NumElts =
13560         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13561     uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
13562     Index &= NumElts - 1;
13563     // These builtins exist so we can ensure the index is an ICE and in range.
13564     // Otherwise we could just do this in the header file.
13565     return Builder.CreateExtractElement(Ops[0], Index);
13566   }
13567   case X86::BI__builtin_ia32_vec_set_v16qi:
13568   case X86::BI__builtin_ia32_vec_set_v8hi:
13569   case X86::BI__builtin_ia32_vec_set_v4si:
13570   case X86::BI__builtin_ia32_vec_set_v2di:
13571   case X86::BI__builtin_ia32_vec_set_v32qi:
13572   case X86::BI__builtin_ia32_vec_set_v16hi:
13573   case X86::BI__builtin_ia32_vec_set_v8si:
13574   case X86::BI__builtin_ia32_vec_set_v4di: {
13575     unsigned NumElts =
13576         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13577     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
13578     Index &= NumElts - 1;
13579     // These builtins exist so we can ensure the index is an ICE and in range.
13580     // Otherwise we could just do this in the header file.
13581     return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
13582   }
13583   case X86::BI_mm_setcsr:
13584   case X86::BI__builtin_ia32_ldmxcsr: {
13585     Address Tmp = CreateMemTemp(E->getArg(0)->getType());
13586     Builder.CreateStore(Ops[0], Tmp);
13587     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
13588                           Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
13589   }
13590   case X86::BI_mm_getcsr:
13591   case X86::BI__builtin_ia32_stmxcsr: {
13592     Address Tmp = CreateMemTemp(E->getType());
13593     Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
13594                        Builder.CreateBitCast(Tmp.getPointer(), Int8PtrTy));
13595     return Builder.CreateLoad(Tmp, "stmxcsr");
13596   }
13597   case X86::BI__builtin_ia32_xsave:
13598   case X86::BI__builtin_ia32_xsave64:
13599   case X86::BI__builtin_ia32_xrstor:
13600   case X86::BI__builtin_ia32_xrstor64:
13601   case X86::BI__builtin_ia32_xsaveopt:
13602   case X86::BI__builtin_ia32_xsaveopt64:
13603   case X86::BI__builtin_ia32_xrstors:
13604   case X86::BI__builtin_ia32_xrstors64:
13605   case X86::BI__builtin_ia32_xsavec:
13606   case X86::BI__builtin_ia32_xsavec64:
13607   case X86::BI__builtin_ia32_xsaves:
13608   case X86::BI__builtin_ia32_xsaves64:
13609   case X86::BI__builtin_ia32_xsetbv:
13610   case X86::BI_xsetbv: {
13611     Intrinsic::ID ID;
13612 #define INTRINSIC_X86_XSAVE_ID(NAME) \
13613     case X86::BI__builtin_ia32_##NAME: \
13614       ID = Intrinsic::x86_##NAME; \
13615       break
13616     switch (BuiltinID) {
13617     default: llvm_unreachable("Unsupported intrinsic!");
13618     INTRINSIC_X86_XSAVE_ID(xsave);
13619     INTRINSIC_X86_XSAVE_ID(xsave64);
13620     INTRINSIC_X86_XSAVE_ID(xrstor);
13621     INTRINSIC_X86_XSAVE_ID(xrstor64);
13622     INTRINSIC_X86_XSAVE_ID(xsaveopt);
13623     INTRINSIC_X86_XSAVE_ID(xsaveopt64);
13624     INTRINSIC_X86_XSAVE_ID(xrstors);
13625     INTRINSIC_X86_XSAVE_ID(xrstors64);
13626     INTRINSIC_X86_XSAVE_ID(xsavec);
13627     INTRINSIC_X86_XSAVE_ID(xsavec64);
13628     INTRINSIC_X86_XSAVE_ID(xsaves);
13629     INTRINSIC_X86_XSAVE_ID(xsaves64);
13630     INTRINSIC_X86_XSAVE_ID(xsetbv);
13631     case X86::BI_xsetbv:
13632       ID = Intrinsic::x86_xsetbv;
13633       break;
13634     }
13635 #undef INTRINSIC_X86_XSAVE_ID
13636     Value *Mhi = Builder.CreateTrunc(
13637       Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
13638     Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
13639     Ops[1] = Mhi;
13640     Ops.push_back(Mlo);
13641     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
13642   }
13643   case X86::BI__builtin_ia32_xgetbv:
13644   case X86::BI_xgetbv:
13645     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
13646   case X86::BI__builtin_ia32_storedqudi128_mask:
13647   case X86::BI__builtin_ia32_storedqusi128_mask:
13648   case X86::BI__builtin_ia32_storedquhi128_mask:
13649   case X86::BI__builtin_ia32_storedquqi128_mask:
13650   case X86::BI__builtin_ia32_storeupd128_mask:
13651   case X86::BI__builtin_ia32_storeups128_mask:
13652   case X86::BI__builtin_ia32_storedqudi256_mask:
13653   case X86::BI__builtin_ia32_storedqusi256_mask:
13654   case X86::BI__builtin_ia32_storedquhi256_mask:
13655   case X86::BI__builtin_ia32_storedquqi256_mask:
13656   case X86::BI__builtin_ia32_storeupd256_mask:
13657   case X86::BI__builtin_ia32_storeups256_mask:
13658   case X86::BI__builtin_ia32_storedqudi512_mask:
13659   case X86::BI__builtin_ia32_storedqusi512_mask:
13660   case X86::BI__builtin_ia32_storedquhi512_mask:
13661   case X86::BI__builtin_ia32_storedquqi512_mask:
13662   case X86::BI__builtin_ia32_storeupd512_mask:
13663   case X86::BI__builtin_ia32_storeups512_mask:
13664     return EmitX86MaskedStore(*this, Ops, Align(1));
13665 
13666   case X86::BI__builtin_ia32_storesh128_mask:
13667   case X86::BI__builtin_ia32_storess128_mask:
13668   case X86::BI__builtin_ia32_storesd128_mask:
13669     return EmitX86MaskedStore(*this, Ops, Align(1));
13670 
13671   case X86::BI__builtin_ia32_vpopcntb_128:
13672   case X86::BI__builtin_ia32_vpopcntd_128:
13673   case X86::BI__builtin_ia32_vpopcntq_128:
13674   case X86::BI__builtin_ia32_vpopcntw_128:
13675   case X86::BI__builtin_ia32_vpopcntb_256:
13676   case X86::BI__builtin_ia32_vpopcntd_256:
13677   case X86::BI__builtin_ia32_vpopcntq_256:
13678   case X86::BI__builtin_ia32_vpopcntw_256:
13679   case X86::BI__builtin_ia32_vpopcntb_512:
13680   case X86::BI__builtin_ia32_vpopcntd_512:
13681   case X86::BI__builtin_ia32_vpopcntq_512:
13682   case X86::BI__builtin_ia32_vpopcntw_512: {
13683     llvm::Type *ResultType = ConvertType(E->getType());
13684     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
13685     return Builder.CreateCall(F, Ops);
13686   }
13687   case X86::BI__builtin_ia32_cvtmask2b128:
13688   case X86::BI__builtin_ia32_cvtmask2b256:
13689   case X86::BI__builtin_ia32_cvtmask2b512:
13690   case X86::BI__builtin_ia32_cvtmask2w128:
13691   case X86::BI__builtin_ia32_cvtmask2w256:
13692   case X86::BI__builtin_ia32_cvtmask2w512:
13693   case X86::BI__builtin_ia32_cvtmask2d128:
13694   case X86::BI__builtin_ia32_cvtmask2d256:
13695   case X86::BI__builtin_ia32_cvtmask2d512:
13696   case X86::BI__builtin_ia32_cvtmask2q128:
13697   case X86::BI__builtin_ia32_cvtmask2q256:
13698   case X86::BI__builtin_ia32_cvtmask2q512:
13699     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
13700 
13701   case X86::BI__builtin_ia32_cvtb2mask128:
13702   case X86::BI__builtin_ia32_cvtb2mask256:
13703   case X86::BI__builtin_ia32_cvtb2mask512:
13704   case X86::BI__builtin_ia32_cvtw2mask128:
13705   case X86::BI__builtin_ia32_cvtw2mask256:
13706   case X86::BI__builtin_ia32_cvtw2mask512:
13707   case X86::BI__builtin_ia32_cvtd2mask128:
13708   case X86::BI__builtin_ia32_cvtd2mask256:
13709   case X86::BI__builtin_ia32_cvtd2mask512:
13710   case X86::BI__builtin_ia32_cvtq2mask128:
13711   case X86::BI__builtin_ia32_cvtq2mask256:
13712   case X86::BI__builtin_ia32_cvtq2mask512:
13713     return EmitX86ConvertToMask(*this, Ops[0]);
13714 
13715   case X86::BI__builtin_ia32_cvtdq2ps512_mask:
13716   case X86::BI__builtin_ia32_cvtqq2ps512_mask:
13717   case X86::BI__builtin_ia32_cvtqq2pd512_mask:
13718   case X86::BI__builtin_ia32_vcvtw2ph512_mask:
13719   case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
13720   case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
13721     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
13722   case X86::BI__builtin_ia32_cvtudq2ps512_mask:
13723   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
13724   case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
13725   case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
13726   case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
13727   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
13728     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
13729 
13730   case X86::BI__builtin_ia32_vfmaddss3:
13731   case X86::BI__builtin_ia32_vfmaddsd3:
13732   case X86::BI__builtin_ia32_vfmaddsh3_mask:
13733   case X86::BI__builtin_ia32_vfmaddss3_mask:
13734   case X86::BI__builtin_ia32_vfmaddsd3_mask:
13735     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
13736   case X86::BI__builtin_ia32_vfmaddss:
13737   case X86::BI__builtin_ia32_vfmaddsd:
13738     return EmitScalarFMAExpr(*this, E, Ops,
13739                              Constant::getNullValue(Ops[0]->getType()));
13740   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
13741   case X86::BI__builtin_ia32_vfmaddss3_maskz:
13742   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
13743     return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
13744   case X86::BI__builtin_ia32_vfmaddsh3_mask3:
13745   case X86::BI__builtin_ia32_vfmaddss3_mask3:
13746   case X86::BI__builtin_ia32_vfmaddsd3_mask3:
13747     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
13748   case X86::BI__builtin_ia32_vfmsubsh3_mask3:
13749   case X86::BI__builtin_ia32_vfmsubss3_mask3:
13750   case X86::BI__builtin_ia32_vfmsubsd3_mask3:
13751     return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
13752                              /*NegAcc*/ true);
13753   case X86::BI__builtin_ia32_vfmaddph:
13754   case X86::BI__builtin_ia32_vfmaddps:
13755   case X86::BI__builtin_ia32_vfmaddpd:
13756   case X86::BI__builtin_ia32_vfmaddph256:
13757   case X86::BI__builtin_ia32_vfmaddps256:
13758   case X86::BI__builtin_ia32_vfmaddpd256:
13759   case X86::BI__builtin_ia32_vfmaddph512_mask:
13760   case X86::BI__builtin_ia32_vfmaddph512_maskz:
13761   case X86::BI__builtin_ia32_vfmaddph512_mask3:
13762   case X86::BI__builtin_ia32_vfmaddps512_mask:
13763   case X86::BI__builtin_ia32_vfmaddps512_maskz:
13764   case X86::BI__builtin_ia32_vfmaddps512_mask3:
13765   case X86::BI__builtin_ia32_vfmsubps512_mask3:
13766   case X86::BI__builtin_ia32_vfmaddpd512_mask:
13767   case X86::BI__builtin_ia32_vfmaddpd512_maskz:
13768   case X86::BI__builtin_ia32_vfmaddpd512_mask3:
13769   case X86::BI__builtin_ia32_vfmsubpd512_mask3:
13770   case X86::BI__builtin_ia32_vfmsubph512_mask3:
13771     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
13772   case X86::BI__builtin_ia32_vfmaddsubph512_mask:
13773   case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13774   case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13775   case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13776   case X86::BI__builtin_ia32_vfmaddsubps512_mask:
13777   case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13778   case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13779   case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13780   case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13781   case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13782   case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13783   case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13784     return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
13785 
13786   case X86::BI__builtin_ia32_movdqa32store128_mask:
13787   case X86::BI__builtin_ia32_movdqa64store128_mask:
13788   case X86::BI__builtin_ia32_storeaps128_mask:
13789   case X86::BI__builtin_ia32_storeapd128_mask:
13790   case X86::BI__builtin_ia32_movdqa32store256_mask:
13791   case X86::BI__builtin_ia32_movdqa64store256_mask:
13792   case X86::BI__builtin_ia32_storeaps256_mask:
13793   case X86::BI__builtin_ia32_storeapd256_mask:
13794   case X86::BI__builtin_ia32_movdqa32store512_mask:
13795   case X86::BI__builtin_ia32_movdqa64store512_mask:
13796   case X86::BI__builtin_ia32_storeaps512_mask:
13797   case X86::BI__builtin_ia32_storeapd512_mask:
13798     return EmitX86MaskedStore(
13799         *this, Ops,
13800         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
13801 
13802   case X86::BI__builtin_ia32_loadups128_mask:
13803   case X86::BI__builtin_ia32_loadups256_mask:
13804   case X86::BI__builtin_ia32_loadups512_mask:
13805   case X86::BI__builtin_ia32_loadupd128_mask:
13806   case X86::BI__builtin_ia32_loadupd256_mask:
13807   case X86::BI__builtin_ia32_loadupd512_mask:
13808   case X86::BI__builtin_ia32_loaddquqi128_mask:
13809   case X86::BI__builtin_ia32_loaddquqi256_mask:
13810   case X86::BI__builtin_ia32_loaddquqi512_mask:
13811   case X86::BI__builtin_ia32_loaddquhi128_mask:
13812   case X86::BI__builtin_ia32_loaddquhi256_mask:
13813   case X86::BI__builtin_ia32_loaddquhi512_mask:
13814   case X86::BI__builtin_ia32_loaddqusi128_mask:
13815   case X86::BI__builtin_ia32_loaddqusi256_mask:
13816   case X86::BI__builtin_ia32_loaddqusi512_mask:
13817   case X86::BI__builtin_ia32_loaddqudi128_mask:
13818   case X86::BI__builtin_ia32_loaddqudi256_mask:
13819   case X86::BI__builtin_ia32_loaddqudi512_mask:
13820     return EmitX86MaskedLoad(*this, Ops, Align(1));
13821 
13822   case X86::BI__builtin_ia32_loadsh128_mask:
13823   case X86::BI__builtin_ia32_loadss128_mask:
13824   case X86::BI__builtin_ia32_loadsd128_mask:
13825     return EmitX86MaskedLoad(*this, Ops, Align(1));
13826 
13827   case X86::BI__builtin_ia32_loadaps128_mask:
13828   case X86::BI__builtin_ia32_loadaps256_mask:
13829   case X86::BI__builtin_ia32_loadaps512_mask:
13830   case X86::BI__builtin_ia32_loadapd128_mask:
13831   case X86::BI__builtin_ia32_loadapd256_mask:
13832   case X86::BI__builtin_ia32_loadapd512_mask:
13833   case X86::BI__builtin_ia32_movdqa32load128_mask:
13834   case X86::BI__builtin_ia32_movdqa32load256_mask:
13835   case X86::BI__builtin_ia32_movdqa32load512_mask:
13836   case X86::BI__builtin_ia32_movdqa64load128_mask:
13837   case X86::BI__builtin_ia32_movdqa64load256_mask:
13838   case X86::BI__builtin_ia32_movdqa64load512_mask:
13839     return EmitX86MaskedLoad(
13840         *this, Ops,
13841         getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
13842 
13843   case X86::BI__builtin_ia32_expandloaddf128_mask:
13844   case X86::BI__builtin_ia32_expandloaddf256_mask:
13845   case X86::BI__builtin_ia32_expandloaddf512_mask:
13846   case X86::BI__builtin_ia32_expandloadsf128_mask:
13847   case X86::BI__builtin_ia32_expandloadsf256_mask:
13848   case X86::BI__builtin_ia32_expandloadsf512_mask:
13849   case X86::BI__builtin_ia32_expandloaddi128_mask:
13850   case X86::BI__builtin_ia32_expandloaddi256_mask:
13851   case X86::BI__builtin_ia32_expandloaddi512_mask:
13852   case X86::BI__builtin_ia32_expandloadsi128_mask:
13853   case X86::BI__builtin_ia32_expandloadsi256_mask:
13854   case X86::BI__builtin_ia32_expandloadsi512_mask:
13855   case X86::BI__builtin_ia32_expandloadhi128_mask:
13856   case X86::BI__builtin_ia32_expandloadhi256_mask:
13857   case X86::BI__builtin_ia32_expandloadhi512_mask:
13858   case X86::BI__builtin_ia32_expandloadqi128_mask:
13859   case X86::BI__builtin_ia32_expandloadqi256_mask:
13860   case X86::BI__builtin_ia32_expandloadqi512_mask:
13861     return EmitX86ExpandLoad(*this, Ops);
13862 
13863   case X86::BI__builtin_ia32_compressstoredf128_mask:
13864   case X86::BI__builtin_ia32_compressstoredf256_mask:
13865   case X86::BI__builtin_ia32_compressstoredf512_mask:
13866   case X86::BI__builtin_ia32_compressstoresf128_mask:
13867   case X86::BI__builtin_ia32_compressstoresf256_mask:
13868   case X86::BI__builtin_ia32_compressstoresf512_mask:
13869   case X86::BI__builtin_ia32_compressstoredi128_mask:
13870   case X86::BI__builtin_ia32_compressstoredi256_mask:
13871   case X86::BI__builtin_ia32_compressstoredi512_mask:
13872   case X86::BI__builtin_ia32_compressstoresi128_mask:
13873   case X86::BI__builtin_ia32_compressstoresi256_mask:
13874   case X86::BI__builtin_ia32_compressstoresi512_mask:
13875   case X86::BI__builtin_ia32_compressstorehi128_mask:
13876   case X86::BI__builtin_ia32_compressstorehi256_mask:
13877   case X86::BI__builtin_ia32_compressstorehi512_mask:
13878   case X86::BI__builtin_ia32_compressstoreqi128_mask:
13879   case X86::BI__builtin_ia32_compressstoreqi256_mask:
13880   case X86::BI__builtin_ia32_compressstoreqi512_mask:
13881     return EmitX86CompressStore(*this, Ops);
13882 
13883   case X86::BI__builtin_ia32_expanddf128_mask:
13884   case X86::BI__builtin_ia32_expanddf256_mask:
13885   case X86::BI__builtin_ia32_expanddf512_mask:
13886   case X86::BI__builtin_ia32_expandsf128_mask:
13887   case X86::BI__builtin_ia32_expandsf256_mask:
13888   case X86::BI__builtin_ia32_expandsf512_mask:
13889   case X86::BI__builtin_ia32_expanddi128_mask:
13890   case X86::BI__builtin_ia32_expanddi256_mask:
13891   case X86::BI__builtin_ia32_expanddi512_mask:
13892   case X86::BI__builtin_ia32_expandsi128_mask:
13893   case X86::BI__builtin_ia32_expandsi256_mask:
13894   case X86::BI__builtin_ia32_expandsi512_mask:
13895   case X86::BI__builtin_ia32_expandhi128_mask:
13896   case X86::BI__builtin_ia32_expandhi256_mask:
13897   case X86::BI__builtin_ia32_expandhi512_mask:
13898   case X86::BI__builtin_ia32_expandqi128_mask:
13899   case X86::BI__builtin_ia32_expandqi256_mask:
13900   case X86::BI__builtin_ia32_expandqi512_mask:
13901     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
13902 
13903   case X86::BI__builtin_ia32_compressdf128_mask:
13904   case X86::BI__builtin_ia32_compressdf256_mask:
13905   case X86::BI__builtin_ia32_compressdf512_mask:
13906   case X86::BI__builtin_ia32_compresssf128_mask:
13907   case X86::BI__builtin_ia32_compresssf256_mask:
13908   case X86::BI__builtin_ia32_compresssf512_mask:
13909   case X86::BI__builtin_ia32_compressdi128_mask:
13910   case X86::BI__builtin_ia32_compressdi256_mask:
13911   case X86::BI__builtin_ia32_compressdi512_mask:
13912   case X86::BI__builtin_ia32_compresssi128_mask:
13913   case X86::BI__builtin_ia32_compresssi256_mask:
13914   case X86::BI__builtin_ia32_compresssi512_mask:
13915   case X86::BI__builtin_ia32_compresshi128_mask:
13916   case X86::BI__builtin_ia32_compresshi256_mask:
13917   case X86::BI__builtin_ia32_compresshi512_mask:
13918   case X86::BI__builtin_ia32_compressqi128_mask:
13919   case X86::BI__builtin_ia32_compressqi256_mask:
13920   case X86::BI__builtin_ia32_compressqi512_mask:
13921     return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
13922 
13923   case X86::BI__builtin_ia32_gather3div2df:
13924   case X86::BI__builtin_ia32_gather3div2di:
13925   case X86::BI__builtin_ia32_gather3div4df:
13926   case X86::BI__builtin_ia32_gather3div4di:
13927   case X86::BI__builtin_ia32_gather3div4sf:
13928   case X86::BI__builtin_ia32_gather3div4si:
13929   case X86::BI__builtin_ia32_gather3div8sf:
13930   case X86::BI__builtin_ia32_gather3div8si:
13931   case X86::BI__builtin_ia32_gather3siv2df:
13932   case X86::BI__builtin_ia32_gather3siv2di:
13933   case X86::BI__builtin_ia32_gather3siv4df:
13934   case X86::BI__builtin_ia32_gather3siv4di:
13935   case X86::BI__builtin_ia32_gather3siv4sf:
13936   case X86::BI__builtin_ia32_gather3siv4si:
13937   case X86::BI__builtin_ia32_gather3siv8sf:
13938   case X86::BI__builtin_ia32_gather3siv8si:
13939   case X86::BI__builtin_ia32_gathersiv8df:
13940   case X86::BI__builtin_ia32_gathersiv16sf:
13941   case X86::BI__builtin_ia32_gatherdiv8df:
13942   case X86::BI__builtin_ia32_gatherdiv16sf:
13943   case X86::BI__builtin_ia32_gathersiv8di:
13944   case X86::BI__builtin_ia32_gathersiv16si:
13945   case X86::BI__builtin_ia32_gatherdiv8di:
13946   case X86::BI__builtin_ia32_gatherdiv16si: {
13947     Intrinsic::ID IID;
13948     switch (BuiltinID) {
13949     default: llvm_unreachable("Unexpected builtin");
13950     case X86::BI__builtin_ia32_gather3div2df:
13951       IID = Intrinsic::x86_avx512_mask_gather3div2_df;
13952       break;
13953     case X86::BI__builtin_ia32_gather3div2di:
13954       IID = Intrinsic::x86_avx512_mask_gather3div2_di;
13955       break;
13956     case X86::BI__builtin_ia32_gather3div4df:
13957       IID = Intrinsic::x86_avx512_mask_gather3div4_df;
13958       break;
13959     case X86::BI__builtin_ia32_gather3div4di:
13960       IID = Intrinsic::x86_avx512_mask_gather3div4_di;
13961       break;
13962     case X86::BI__builtin_ia32_gather3div4sf:
13963       IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
13964       break;
13965     case X86::BI__builtin_ia32_gather3div4si:
13966       IID = Intrinsic::x86_avx512_mask_gather3div4_si;
13967       break;
13968     case X86::BI__builtin_ia32_gather3div8sf:
13969       IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
13970       break;
13971     case X86::BI__builtin_ia32_gather3div8si:
13972       IID = Intrinsic::x86_avx512_mask_gather3div8_si;
13973       break;
13974     case X86::BI__builtin_ia32_gather3siv2df:
13975       IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
13976       break;
13977     case X86::BI__builtin_ia32_gather3siv2di:
13978       IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
13979       break;
13980     case X86::BI__builtin_ia32_gather3siv4df:
13981       IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
13982       break;
13983     case X86::BI__builtin_ia32_gather3siv4di:
13984       IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
13985       break;
13986     case X86::BI__builtin_ia32_gather3siv4sf:
13987       IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
13988       break;
13989     case X86::BI__builtin_ia32_gather3siv4si:
13990       IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
13991       break;
13992     case X86::BI__builtin_ia32_gather3siv8sf:
13993       IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
13994       break;
13995     case X86::BI__builtin_ia32_gather3siv8si:
13996       IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
13997       break;
13998     case X86::BI__builtin_ia32_gathersiv8df:
13999       IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
14000       break;
14001     case X86::BI__builtin_ia32_gathersiv16sf:
14002       IID = Intrinsic::x86_avx512_mask_gather_dps_512;
14003       break;
14004     case X86::BI__builtin_ia32_gatherdiv8df:
14005       IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
14006       break;
14007     case X86::BI__builtin_ia32_gatherdiv16sf:
14008       IID = Intrinsic::x86_avx512_mask_gather_qps_512;
14009       break;
14010     case X86::BI__builtin_ia32_gathersiv8di:
14011       IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
14012       break;
14013     case X86::BI__builtin_ia32_gathersiv16si:
14014       IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
14015       break;
14016     case X86::BI__builtin_ia32_gatherdiv8di:
14017       IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
14018       break;
14019     case X86::BI__builtin_ia32_gatherdiv16si:
14020       IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
14021       break;
14022     }
14023 
14024     unsigned MinElts = std::min(
14025         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
14026         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
14027     Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
14028     Function *Intr = CGM.getIntrinsic(IID);
14029     return Builder.CreateCall(Intr, Ops);
14030   }
14031 
14032   case X86::BI__builtin_ia32_scattersiv8df:
14033   case X86::BI__builtin_ia32_scattersiv16sf:
14034   case X86::BI__builtin_ia32_scatterdiv8df:
14035   case X86::BI__builtin_ia32_scatterdiv16sf:
14036   case X86::BI__builtin_ia32_scattersiv8di:
14037   case X86::BI__builtin_ia32_scattersiv16si:
14038   case X86::BI__builtin_ia32_scatterdiv8di:
14039   case X86::BI__builtin_ia32_scatterdiv16si:
14040   case X86::BI__builtin_ia32_scatterdiv2df:
14041   case X86::BI__builtin_ia32_scatterdiv2di:
14042   case X86::BI__builtin_ia32_scatterdiv4df:
14043   case X86::BI__builtin_ia32_scatterdiv4di:
14044   case X86::BI__builtin_ia32_scatterdiv4sf:
14045   case X86::BI__builtin_ia32_scatterdiv4si:
14046   case X86::BI__builtin_ia32_scatterdiv8sf:
14047   case X86::BI__builtin_ia32_scatterdiv8si:
14048   case X86::BI__builtin_ia32_scattersiv2df:
14049   case X86::BI__builtin_ia32_scattersiv2di:
14050   case X86::BI__builtin_ia32_scattersiv4df:
14051   case X86::BI__builtin_ia32_scattersiv4di:
14052   case X86::BI__builtin_ia32_scattersiv4sf:
14053   case X86::BI__builtin_ia32_scattersiv4si:
14054   case X86::BI__builtin_ia32_scattersiv8sf:
14055   case X86::BI__builtin_ia32_scattersiv8si: {
14056     Intrinsic::ID IID;
14057     switch (BuiltinID) {
14058     default: llvm_unreachable("Unexpected builtin");
14059     case X86::BI__builtin_ia32_scattersiv8df:
14060       IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
14061       break;
14062     case X86::BI__builtin_ia32_scattersiv16sf:
14063       IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
14064       break;
14065     case X86::BI__builtin_ia32_scatterdiv8df:
14066       IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
14067       break;
14068     case X86::BI__builtin_ia32_scatterdiv16sf:
14069       IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
14070       break;
14071     case X86::BI__builtin_ia32_scattersiv8di:
14072       IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
14073       break;
14074     case X86::BI__builtin_ia32_scattersiv16si:
14075       IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
14076       break;
14077     case X86::BI__builtin_ia32_scatterdiv8di:
14078       IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
14079       break;
14080     case X86::BI__builtin_ia32_scatterdiv16si:
14081       IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
14082       break;
14083     case X86::BI__builtin_ia32_scatterdiv2df:
14084       IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
14085       break;
14086     case X86::BI__builtin_ia32_scatterdiv2di:
14087       IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
14088       break;
14089     case X86::BI__builtin_ia32_scatterdiv4df:
14090       IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
14091       break;
14092     case X86::BI__builtin_ia32_scatterdiv4di:
14093       IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
14094       break;
14095     case X86::BI__builtin_ia32_scatterdiv4sf:
14096       IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
14097       break;
14098     case X86::BI__builtin_ia32_scatterdiv4si:
14099       IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
14100       break;
14101     case X86::BI__builtin_ia32_scatterdiv8sf:
14102       IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
14103       break;
14104     case X86::BI__builtin_ia32_scatterdiv8si:
14105       IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
14106       break;
14107     case X86::BI__builtin_ia32_scattersiv2df:
14108       IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
14109       break;
14110     case X86::BI__builtin_ia32_scattersiv2di:
14111       IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
14112       break;
14113     case X86::BI__builtin_ia32_scattersiv4df:
14114       IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
14115       break;
14116     case X86::BI__builtin_ia32_scattersiv4di:
14117       IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
14118       break;
14119     case X86::BI__builtin_ia32_scattersiv4sf:
14120       IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
14121       break;
14122     case X86::BI__builtin_ia32_scattersiv4si:
14123       IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
14124       break;
14125     case X86::BI__builtin_ia32_scattersiv8sf:
14126       IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
14127       break;
14128     case X86::BI__builtin_ia32_scattersiv8si:
14129       IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
14130       break;
14131     }
14132 
14133     unsigned MinElts = std::min(
14134         cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
14135         cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
14136     Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
14137     Function *Intr = CGM.getIntrinsic(IID);
14138     return Builder.CreateCall(Intr, Ops);
14139   }
14140 
14141   case X86::BI__builtin_ia32_vextractf128_pd256:
14142   case X86::BI__builtin_ia32_vextractf128_ps256:
14143   case X86::BI__builtin_ia32_vextractf128_si256:
14144   case X86::BI__builtin_ia32_extract128i256:
14145   case X86::BI__builtin_ia32_extractf64x4_mask:
14146   case X86::BI__builtin_ia32_extractf32x4_mask:
14147   case X86::BI__builtin_ia32_extracti64x4_mask:
14148   case X86::BI__builtin_ia32_extracti32x4_mask:
14149   case X86::BI__builtin_ia32_extractf32x8_mask:
14150   case X86::BI__builtin_ia32_extracti32x8_mask:
14151   case X86::BI__builtin_ia32_extractf32x4_256_mask:
14152   case X86::BI__builtin_ia32_extracti32x4_256_mask:
14153   case X86::BI__builtin_ia32_extractf64x2_256_mask:
14154   case X86::BI__builtin_ia32_extracti64x2_256_mask:
14155   case X86::BI__builtin_ia32_extractf64x2_512_mask:
14156   case X86::BI__builtin_ia32_extracti64x2_512_mask: {
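    // Subvector extracts lower to a shufflevector that picks NumElts
    // consecutive elements starting at the offset selected by the immediate;
    // the masked forms carry a passthrough and mask as two extra operands.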
14157     auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
14158     unsigned NumElts = DstTy->getNumElements();
14159     unsigned SrcNumElts =
14160         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14161     unsigned SubVectors = SrcNumElts / NumElts;
14162     unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
14163     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14164     Index &= SubVectors - 1; // Remove any extra bits.
14165     Index *= NumElts;
14166 
14167     int Indices[16];
14168     for (unsigned i = 0; i != NumElts; ++i)
14169       Indices[i] = i + Index;
14170 
14171     Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14172                                              "extract");
14173 
14174     if (Ops.size() == 4)
14175       Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
14176 
14177     return Res;
14178   }
14179   case X86::BI__builtin_ia32_vinsertf128_pd256:
14180   case X86::BI__builtin_ia32_vinsertf128_ps256:
14181   case X86::BI__builtin_ia32_vinsertf128_si256:
14182   case X86::BI__builtin_ia32_insert128i256:
14183   case X86::BI__builtin_ia32_insertf64x4:
14184   case X86::BI__builtin_ia32_insertf32x4:
14185   case X86::BI__builtin_ia32_inserti64x4:
14186   case X86::BI__builtin_ia32_inserti32x4:
14187   case X86::BI__builtin_ia32_insertf32x8:
14188   case X86::BI__builtin_ia32_inserti32x8:
14189   case X86::BI__builtin_ia32_insertf32x4_256:
14190   case X86::BI__builtin_ia32_inserti32x4_256:
14191   case X86::BI__builtin_ia32_insertf64x2_256:
14192   case X86::BI__builtin_ia32_inserti64x2_256:
14193   case X86::BI__builtin_ia32_insertf64x2_512:
14194   case X86::BI__builtin_ia32_inserti64x2_512: {
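    // Subvector inserts lower to two shuffles: first widen the subvector
    // (Ops[1]) to the destination width, then blend it into Ops[0] at the
    // element offset selected by the immediate.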
14195     unsigned DstNumElts =
14196         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14197     unsigned SrcNumElts =
14198         cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
14199     unsigned SubVectors = DstNumElts / SrcNumElts;
14200     unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
14201     assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14202     Index &= SubVectors - 1; // Remove any extra bits.
14203     Index *= SrcNumElts;
14204 
14205     int Indices[16];
14206     for (unsigned i = 0; i != DstNumElts; ++i)
14207       Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
14208 
14209     Value *Op1 = Builder.CreateShuffleVector(
14210         Ops[1], ArrayRef(Indices, DstNumElts), "widen");
14211 
14212     for (unsigned i = 0; i != DstNumElts; ++i) {
14213       if (i >= Index && i < (Index + SrcNumElts))
14214         Indices[i] = (i - Index) + DstNumElts;
14215       else
14216         Indices[i] = i;
14217     }
14218 
14219     return Builder.CreateShuffleVector(Ops[0], Op1,
14220                                        ArrayRef(Indices, DstNumElts), "insert");
14221   }
14222   case X86::BI__builtin_ia32_pmovqd512_mask:
14223   case X86::BI__builtin_ia32_pmovwb512_mask: {
14224     Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14225     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
14226   }
14227   case X86::BI__builtin_ia32_pmovdb512_mask:
14228   case X86::BI__builtin_ia32_pmovdw512_mask:
14229   case X86::BI__builtin_ia32_pmovqw512_mask: {
14230     if (const auto *C = dyn_cast<Constant>(Ops[2]))
14231       if (C->isAllOnesValue())
14232         return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14233 
14234     Intrinsic::ID IID;
14235     switch (BuiltinID) {
14236     default: llvm_unreachable("Unsupported intrinsic!");
14237     case X86::BI__builtin_ia32_pmovdb512_mask:
14238       IID = Intrinsic::x86_avx512_mask_pmov_db_512;
14239       break;
14240     case X86::BI__builtin_ia32_pmovdw512_mask:
14241       IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
14242       break;
14243     case X86::BI__builtin_ia32_pmovqw512_mask:
14244       IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
14245       break;
14246     }
14247 
14248     Function *Intr = CGM.getIntrinsic(IID);
14249     return Builder.CreateCall(Intr, Ops);
14250   }
14251   case X86::BI__builtin_ia32_pblendw128:
14252   case X86::BI__builtin_ia32_blendpd:
14253   case X86::BI__builtin_ia32_blendps:
14254   case X86::BI__builtin_ia32_blendpd256:
14255   case X86::BI__builtin_ia32_blendps256:
14256   case X86::BI__builtin_ia32_pblendw256:
14257   case X86::BI__builtin_ia32_pblendd128:
14258   case X86::BI__builtin_ia32_pblendd256: {
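    // Immediate blends lower to a shufflevector: a set immediate bit selects
    // the corresponding element from Ops[1], a clear bit selects it from
    // Ops[0].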
14259     unsigned NumElts =
14260         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14261     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14262 
14263     int Indices[16];
    // If there are more than 8 elements, the 8-bit immediate is reused for the
    // upper elements, so wrap the bit index with (i % 8).
14266     for (unsigned i = 0; i != NumElts; ++i)
14267       Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
14268 
14269     return Builder.CreateShuffleVector(Ops[0], Ops[1],
14270                                        ArrayRef(Indices, NumElts), "blend");
14271   }
14272   case X86::BI__builtin_ia32_pshuflw:
14273   case X86::BI__builtin_ia32_pshuflw256:
14274   case X86::BI__builtin_ia32_pshuflw512: {
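    // pshuflw reorders the low four words of each 128-bit lane according to
    // the 2-bit fields of the immediate; the high four words pass through.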
14275     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14276     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14277     unsigned NumElts = Ty->getNumElements();
14278 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14280     Imm = (Imm & 0xff) * 0x01010101;
14281 
14282     int Indices[32];
14283     for (unsigned l = 0; l != NumElts; l += 8) {
14284       for (unsigned i = 0; i != 4; ++i) {
14285         Indices[l + i] = l + (Imm & 3);
14286         Imm >>= 2;
14287       }
14288       for (unsigned i = 4; i != 8; ++i)
14289         Indices[l + i] = l + i;
14290     }
14291 
14292     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14293                                        "pshuflw");
14294   }
14295   case X86::BI__builtin_ia32_pshufhw:
14296   case X86::BI__builtin_ia32_pshufhw256:
14297   case X86::BI__builtin_ia32_pshufhw512: {
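    // pshufhw reorders the high four words of each 128-bit lane according to
    // the 2-bit fields of the immediate; the low four words pass through.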
14298     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14299     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14300     unsigned NumElts = Ty->getNumElements();
14301 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14303     Imm = (Imm & 0xff) * 0x01010101;
14304 
14305     int Indices[32];
14306     for (unsigned l = 0; l != NumElts; l += 8) {
14307       for (unsigned i = 0; i != 4; ++i)
14308         Indices[l + i] = l + i;
14309       for (unsigned i = 4; i != 8; ++i) {
14310         Indices[l + i] = l + 4 + (Imm & 3);
14311         Imm >>= 2;
14312       }
14313     }
14314 
14315     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14316                                        "pshufhw");
14317   }
14318   case X86::BI__builtin_ia32_pshufd:
14319   case X86::BI__builtin_ia32_pshufd256:
14320   case X86::BI__builtin_ia32_pshufd512:
14321   case X86::BI__builtin_ia32_vpermilpd:
14322   case X86::BI__builtin_ia32_vpermilps:
14323   case X86::BI__builtin_ia32_vpermilpd256:
14324   case X86::BI__builtin_ia32_vpermilps256:
14325   case X86::BI__builtin_ia32_vpermilpd512:
14326   case X86::BI__builtin_ia32_vpermilps512: {
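    // These shuffles permute elements within each 128-bit lane; successive
    // fields of the immediate select the source element for each result slot.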
14327     uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14328     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14329     unsigned NumElts = Ty->getNumElements();
14330     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14331     unsigned NumLaneElts = NumElts / NumLanes;
14332 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14334     Imm = (Imm & 0xff) * 0x01010101;
14335 
14336     int Indices[16];
14337     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14338       for (unsigned i = 0; i != NumLaneElts; ++i) {
14339         Indices[i + l] = (Imm % NumLaneElts) + l;
14340         Imm /= NumLaneElts;
14341       }
14342     }
14343 
14344     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14345                                        "permil");
14346   }
14347   case X86::BI__builtin_ia32_shufpd:
14348   case X86::BI__builtin_ia32_shufpd256:
14349   case X86::BI__builtin_ia32_shufpd512:
14350   case X86::BI__builtin_ia32_shufps:
14351   case X86::BI__builtin_ia32_shufps256:
14352   case X86::BI__builtin_ia32_shufps512: {
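    // shufps/shufpd select per 128-bit lane: the low result elements of each
    // lane come from Ops[0] and the high ones from Ops[1], with the immediate
    // choosing the element within the lane.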
14353     uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14354     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14355     unsigned NumElts = Ty->getNumElements();
14356     unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14357     unsigned NumLaneElts = NumElts / NumLanes;
14358 
    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14360     Imm = (Imm & 0xff) * 0x01010101;
14361 
14362     int Indices[16];
14363     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14364       for (unsigned i = 0; i != NumLaneElts; ++i) {
14365         unsigned Index = Imm % NumLaneElts;
14366         Imm /= NumLaneElts;
14367         if (i >= (NumLaneElts / 2))
14368           Index += NumElts;
14369         Indices[l + i] = l + Index;
14370       }
14371     }
14372 
14373     return Builder.CreateShuffleVector(Ops[0], Ops[1],
14374                                        ArrayRef(Indices, NumElts), "shufp");
14375   }
14376   case X86::BI__builtin_ia32_permdi256:
14377   case X86::BI__builtin_ia32_permdf256:
14378   case X86::BI__builtin_ia32_permdi512:
14379   case X86::BI__builtin_ia32_permdf512: {
14380     unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14381     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14382     unsigned NumElts = Ty->getNumElements();
14383 
14384     // These intrinsics operate on 256-bit lanes of four 64-bit elements.
14385     int Indices[8];
14386     for (unsigned l = 0; l != NumElts; l += 4)
14387       for (unsigned i = 0; i != 4; ++i)
14388         Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
14389 
14390     return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14391                                        "perm");
14392   }
14393   case X86::BI__builtin_ia32_palignr128:
14394   case X86::BI__builtin_ia32_palignr256:
14395   case X86::BI__builtin_ia32_palignr512: {
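    // palignr concatenates the two sources per 128-bit lane and extracts a
    // window shifted right by the immediate, counted in bytes.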
14396     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
14397 
14398     unsigned NumElts =
14399         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14400     assert(NumElts % 16 == 0);
14401 
    // If palignr shifts the pair of vectors by two full lanes (32 bytes) or
    // more, the result is zero.
14404     if (ShiftVal >= 32)
14405       return llvm::Constant::getNullValue(ConvertType(E->getType()));
14406 
14407     // If palignr is shifting the pair of input vectors more than one lane,
14408     // but less than two lanes, convert to shifting in zeroes.
14409     if (ShiftVal > 16) {
14410       ShiftVal -= 16;
14411       Ops[1] = Ops[0];
14412       Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
14413     }
14414 
14415     int Indices[64];
    // 256/512-bit palignr operates on 128-bit lanes, so handle each lane.
14417     for (unsigned l = 0; l != NumElts; l += 16) {
14418       for (unsigned i = 0; i != 16; ++i) {
14419         unsigned Idx = ShiftVal + i;
14420         if (Idx >= 16)
14421           Idx += NumElts - 16; // End of lane, switch operand.
14422         Indices[l + i] = Idx + l;
14423       }
14424     }
14425 
14426     return Builder.CreateShuffleVector(Ops[1], Ops[0],
14427                                        ArrayRef(Indices, NumElts), "palignr");
14428   }
14429   case X86::BI__builtin_ia32_alignd128:
14430   case X86::BI__builtin_ia32_alignd256:
14431   case X86::BI__builtin_ia32_alignd512:
14432   case X86::BI__builtin_ia32_alignq128:
14433   case X86::BI__builtin_ia32_alignq256:
14434   case X86::BI__builtin_ia32_alignq512: {
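    // valignd/valignq concatenate the two full sources (no 128-bit lane
    // splitting) and shift right by the immediate, counted in elements.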
14435     unsigned NumElts =
14436         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14437     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
14438 
    // Mask the shift amount to the width of the vector.
14440     ShiftVal &= NumElts - 1;
14441 
14442     int Indices[16];
14443     for (unsigned i = 0; i != NumElts; ++i)
14444       Indices[i] = i + ShiftVal;
14445 
14446     return Builder.CreateShuffleVector(Ops[1], Ops[0],
14447                                        ArrayRef(Indices, NumElts), "valign");
14448   }
14449   case X86::BI__builtin_ia32_shuf_f32x4_256:
14450   case X86::BI__builtin_ia32_shuf_f64x2_256:
14451   case X86::BI__builtin_ia32_shuf_i32x4_256:
14452   case X86::BI__builtin_ia32_shuf_i64x2_256:
14453   case X86::BI__builtin_ia32_shuf_f32x4:
14454   case X86::BI__builtin_ia32_shuf_f64x2:
14455   case X86::BI__builtin_ia32_shuf_i32x4:
14456   case X86::BI__builtin_ia32_shuf_i64x2: {
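    // SHUFF32x4 and friends shuffle whole 128-bit lanes: the low half of the
    // result takes lanes from Ops[0], the high half from Ops[1], with each
    // lane chosen by successive fields of the immediate.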
14457     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14458     auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14459     unsigned NumElts = Ty->getNumElements();
14460     unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
14461     unsigned NumLaneElts = NumElts / NumLanes;
14462 
14463     int Indices[16];
14464     for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14465       unsigned Index = (Imm % NumLanes) * NumLaneElts;
14466       Imm /= NumLanes; // Discard the bits we just used.
14467       if (l >= (NumElts / 2))
14468         Index += NumElts; // Switch to other source.
14469       for (unsigned i = 0; i != NumLaneElts; ++i) {
14470         Indices[l + i] = Index + i;
14471       }
14472     }
14473 
14474     return Builder.CreateShuffleVector(Ops[0], Ops[1],
14475                                        ArrayRef(Indices, NumElts), "shuf");
14476   }
14477 
14478   case X86::BI__builtin_ia32_vperm2f128_pd256:
14479   case X86::BI__builtin_ia32_vperm2f128_ps256:
14480   case X86::BI__builtin_ia32_vperm2f128_si256:
14481   case X86::BI__builtin_ia32_permti256: {
14482     unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14483     unsigned NumElts =
14484         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14485 
    // This takes a simple approach: a shuffle has two inputs and there are two
    // result lanes, so reserve the first input for the first lane and the
    // second input for the second lane. This may produce duplicate sources,
    // but the backend can deal with that.
14490 
14491     Value *OutOps[2];
14492     int Indices[8];
14493     for (unsigned l = 0; l != 2; ++l) {
14494       // Determine the source for this lane.
14495       if (Imm & (1 << ((l * 4) + 3)))
14496         OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
14497       else if (Imm & (1 << ((l * 4) + 1)))
14498         OutOps[l] = Ops[1];
14499       else
14500         OutOps[l] = Ops[0];
14501 
14502       for (unsigned i = 0; i != NumElts/2; ++i) {
14503         // Start with ith element of the source for this lane.
14504         unsigned Idx = (l * NumElts) + i;
14505         // If bit 0 of the immediate half is set, switch to the high half of
14506         // the source.
14507         if (Imm & (1 << (l * 4)))
14508           Idx += NumElts/2;
14509         Indices[(l * (NumElts/2)) + i] = Idx;
14510       }
14511     }
14512 
14513     return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
14514                                        ArrayRef(Indices, NumElts), "vperm");
14515   }
14516 
14517   case X86::BI__builtin_ia32_pslldqi128_byteshift:
14518   case X86::BI__builtin_ia32_pslldqi256_byteshift:
14519   case X86::BI__builtin_ia32_pslldqi512_byteshift: {
14520     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
14521     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
14522     // Builtin type is vXi64 so multiply by 8 to get bytes.
14523     unsigned NumElts = ResultType->getNumElements() * 8;
14524 
14525     // If pslldq is shifting the vector more than 15 bytes, emit zero.
14526     if (ShiftVal >= 16)
14527       return llvm::Constant::getNullValue(ResultType);
14528 
14529     int Indices[64];
    // 256/512-bit pslldq operates on 128-bit lanes, so handle each lane.
14531     for (unsigned l = 0; l != NumElts; l += 16) {
14532       for (unsigned i = 0; i != 16; ++i) {
14533         unsigned Idx = NumElts + i - ShiftVal;
14534         if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
14535         Indices[l + i] = Idx + l;
14536       }
14537     }
14538 
14539     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
14540     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
14541     Value *Zero = llvm::Constant::getNullValue(VecTy);
14542     Value *SV = Builder.CreateShuffleVector(
14543         Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
14544     return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
14545   }
14546   case X86::BI__builtin_ia32_psrldqi128_byteshift:
14547   case X86::BI__builtin_ia32_psrldqi256_byteshift:
14548   case X86::BI__builtin_ia32_psrldqi512_byteshift: {
14549     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
14550     auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
14551     // Builtin type is vXi64 so multiply by 8 to get bytes.
14552     unsigned NumElts = ResultType->getNumElements() * 8;
14553 
14554     // If psrldq is shifting the vector more than 15 bytes, emit zero.
14555     if (ShiftVal >= 16)
14556       return llvm::Constant::getNullValue(ResultType);
14557 
14558     int Indices[64];
    // 256/512-bit psrldq operates on 128-bit lanes, so handle each lane.
14560     for (unsigned l = 0; l != NumElts; l += 16) {
14561       for (unsigned i = 0; i != 16; ++i) {
14562         unsigned Idx = i + ShiftVal;
14563         if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
14564         Indices[l + i] = Idx + l;
14565       }
14566     }
14567 
14568     auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
14569     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
14570     Value *Zero = llvm::Constant::getNullValue(VecTy);
14571     Value *SV = Builder.CreateShuffleVector(
14572         Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
14573     return Builder.CreateBitCast(SV, ResultType, "cast");
14574   }
14575   case X86::BI__builtin_ia32_kshiftliqi:
14576   case X86::BI__builtin_ia32_kshiftlihi:
14577   case X86::BI__builtin_ia32_kshiftlisi:
14578   case X86::BI__builtin_ia32_kshiftlidi: {
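    // Mask shifts are modeled as a shuffle on the vXi1 form of the mask,
    // shifting in zero elements, then bitcast back to the integer mask type.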
14579     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
14580     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14581 
14582     if (ShiftVal >= NumElts)
14583       return llvm::Constant::getNullValue(Ops[0]->getType());
14584 
14585     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
14586 
14587     int Indices[64];
14588     for (unsigned i = 0; i != NumElts; ++i)
14589       Indices[i] = NumElts + i - ShiftVal;
14590 
14591     Value *Zero = llvm::Constant::getNullValue(In->getType());
14592     Value *SV = Builder.CreateShuffleVector(
14593         Zero, In, ArrayRef(Indices, NumElts), "kshiftl");
14594     return Builder.CreateBitCast(SV, Ops[0]->getType());
14595   }
14596   case X86::BI__builtin_ia32_kshiftriqi:
14597   case X86::BI__builtin_ia32_kshiftrihi:
14598   case X86::BI__builtin_ia32_kshiftrisi:
14599   case X86::BI__builtin_ia32_kshiftridi: {
14600     unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
14601     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14602 
14603     if (ShiftVal >= NumElts)
14604       return llvm::Constant::getNullValue(Ops[0]->getType());
14605 
14606     Value *In = getMaskVecValue(*this, Ops[0], NumElts);
14607 
14608     int Indices[64];
14609     for (unsigned i = 0; i != NumElts; ++i)
14610       Indices[i] = i + ShiftVal;
14611 
14612     Value *Zero = llvm::Constant::getNullValue(In->getType());
14613     Value *SV = Builder.CreateShuffleVector(
14614         In, Zero, ArrayRef(Indices, NumElts), "kshiftr");
14615     return Builder.CreateBitCast(SV, Ops[0]->getType());
14616   }
14617   case X86::BI__builtin_ia32_movnti:
14618   case X86::BI__builtin_ia32_movnti64:
14619   case X86::BI__builtin_ia32_movntsd:
14620   case X86::BI__builtin_ia32_movntss: {
14621     llvm::MDNode *Node = llvm::MDNode::get(
14622         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
14623 
14624     Value *Ptr = Ops[0];
14625     Value *Src = Ops[1];
14626 
    // Extract the 0th element of the source vector.
14628     if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
14629         BuiltinID == X86::BI__builtin_ia32_movntss)
14630       Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");
14631 
14632     // Convert the type of the pointer to a pointer to the stored type.
14633     Value *BC = Builder.CreateBitCast(
14634         Ptr, llvm::PointerType::getUnqual(Src->getType()), "cast");
14635 
14636     // Unaligned nontemporal store of the scalar value.
14637     StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, BC);
14638     SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node);
14639     SI->setAlignment(llvm::Align(1));
14640     return SI;
14641   }
  // Rotate is a special case of funnel shift: the first two args are the same.
14643   case X86::BI__builtin_ia32_vprotb:
14644   case X86::BI__builtin_ia32_vprotw:
14645   case X86::BI__builtin_ia32_vprotd:
14646   case X86::BI__builtin_ia32_vprotq:
14647   case X86::BI__builtin_ia32_vprotbi:
14648   case X86::BI__builtin_ia32_vprotwi:
14649   case X86::BI__builtin_ia32_vprotdi:
14650   case X86::BI__builtin_ia32_vprotqi:
14651   case X86::BI__builtin_ia32_prold128:
14652   case X86::BI__builtin_ia32_prold256:
14653   case X86::BI__builtin_ia32_prold512:
14654   case X86::BI__builtin_ia32_prolq128:
14655   case X86::BI__builtin_ia32_prolq256:
14656   case X86::BI__builtin_ia32_prolq512:
14657   case X86::BI__builtin_ia32_prolvd128:
14658   case X86::BI__builtin_ia32_prolvd256:
14659   case X86::BI__builtin_ia32_prolvd512:
14660   case X86::BI__builtin_ia32_prolvq128:
14661   case X86::BI__builtin_ia32_prolvq256:
14662   case X86::BI__builtin_ia32_prolvq512:
14663     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
14664   case X86::BI__builtin_ia32_prord128:
14665   case X86::BI__builtin_ia32_prord256:
14666   case X86::BI__builtin_ia32_prord512:
14667   case X86::BI__builtin_ia32_prorq128:
14668   case X86::BI__builtin_ia32_prorq256:
14669   case X86::BI__builtin_ia32_prorq512:
14670   case X86::BI__builtin_ia32_prorvd128:
14671   case X86::BI__builtin_ia32_prorvd256:
14672   case X86::BI__builtin_ia32_prorvd512:
14673   case X86::BI__builtin_ia32_prorvq128:
14674   case X86::BI__builtin_ia32_prorvq256:
14675   case X86::BI__builtin_ia32_prorvq512:
14676     return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
14677   case X86::BI__builtin_ia32_selectb_128:
14678   case X86::BI__builtin_ia32_selectb_256:
14679   case X86::BI__builtin_ia32_selectb_512:
14680   case X86::BI__builtin_ia32_selectw_128:
14681   case X86::BI__builtin_ia32_selectw_256:
14682   case X86::BI__builtin_ia32_selectw_512:
14683   case X86::BI__builtin_ia32_selectd_128:
14684   case X86::BI__builtin_ia32_selectd_256:
14685   case X86::BI__builtin_ia32_selectd_512:
14686   case X86::BI__builtin_ia32_selectq_128:
14687   case X86::BI__builtin_ia32_selectq_256:
14688   case X86::BI__builtin_ia32_selectq_512:
14689   case X86::BI__builtin_ia32_selectph_128:
14690   case X86::BI__builtin_ia32_selectph_256:
14691   case X86::BI__builtin_ia32_selectph_512:
14692   case X86::BI__builtin_ia32_selectpbf_128:
14693   case X86::BI__builtin_ia32_selectpbf_256:
14694   case X86::BI__builtin_ia32_selectpbf_512:
14695   case X86::BI__builtin_ia32_selectps_128:
14696   case X86::BI__builtin_ia32_selectps_256:
14697   case X86::BI__builtin_ia32_selectps_512:
14698   case X86::BI__builtin_ia32_selectpd_128:
14699   case X86::BI__builtin_ia32_selectpd_256:
14700   case X86::BI__builtin_ia32_selectpd_512:
14701     return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
14702   case X86::BI__builtin_ia32_selectsh_128:
14703   case X86::BI__builtin_ia32_selectsbf_128:
14704   case X86::BI__builtin_ia32_selectss_128:
14705   case X86::BI__builtin_ia32_selectsd_128: {
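    // Scalar selects merge only element 0: the mask picks between the low
    // elements of Ops[1] and Ops[2]; the upper elements of Ops[1] pass
    // through unchanged.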
14706     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
14707     Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
14708     A = EmitX86ScalarSelect(*this, Ops[0], A, B);
14709     return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
14710   }
14711   case X86::BI__builtin_ia32_cmpb128_mask:
14712   case X86::BI__builtin_ia32_cmpb256_mask:
14713   case X86::BI__builtin_ia32_cmpb512_mask:
14714   case X86::BI__builtin_ia32_cmpw128_mask:
14715   case X86::BI__builtin_ia32_cmpw256_mask:
14716   case X86::BI__builtin_ia32_cmpw512_mask:
14717   case X86::BI__builtin_ia32_cmpd128_mask:
14718   case X86::BI__builtin_ia32_cmpd256_mask:
14719   case X86::BI__builtin_ia32_cmpd512_mask:
14720   case X86::BI__builtin_ia32_cmpq128_mask:
14721   case X86::BI__builtin_ia32_cmpq256_mask:
14722   case X86::BI__builtin_ia32_cmpq512_mask: {
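    // The low 3 bits of the immediate select the predicate; these builtins do
    // signed comparisons, while the ucmp forms below are unsigned.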
14723     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
14724     return EmitX86MaskedCompare(*this, CC, true, Ops);
14725   }
14726   case X86::BI__builtin_ia32_ucmpb128_mask:
14727   case X86::BI__builtin_ia32_ucmpb256_mask:
14728   case X86::BI__builtin_ia32_ucmpb512_mask:
14729   case X86::BI__builtin_ia32_ucmpw128_mask:
14730   case X86::BI__builtin_ia32_ucmpw256_mask:
14731   case X86::BI__builtin_ia32_ucmpw512_mask:
14732   case X86::BI__builtin_ia32_ucmpd128_mask:
14733   case X86::BI__builtin_ia32_ucmpd256_mask:
14734   case X86::BI__builtin_ia32_ucmpd512_mask:
14735   case X86::BI__builtin_ia32_ucmpq128_mask:
14736   case X86::BI__builtin_ia32_ucmpq256_mask:
14737   case X86::BI__builtin_ia32_ucmpq512_mask: {
14738     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
14739     return EmitX86MaskedCompare(*this, CC, false, Ops);
14740   }
14741   case X86::BI__builtin_ia32_vpcomb:
14742   case X86::BI__builtin_ia32_vpcomw:
14743   case X86::BI__builtin_ia32_vpcomd:
14744   case X86::BI__builtin_ia32_vpcomq:
14745     return EmitX86vpcom(*this, Ops, true);
14746   case X86::BI__builtin_ia32_vpcomub:
14747   case X86::BI__builtin_ia32_vpcomuw:
14748   case X86::BI__builtin_ia32_vpcomud:
14749   case X86::BI__builtin_ia32_vpcomuq:
14750     return EmitX86vpcom(*this, Ops, false);
14751 
14752   case X86::BI__builtin_ia32_kortestcqi:
14753   case X86::BI__builtin_ia32_kortestchi:
14754   case X86::BI__builtin_ia32_kortestcsi:
14755   case X86::BI__builtin_ia32_kortestcdi: {
14756     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
14757     Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
14758     Value *Cmp = Builder.CreateICmpEQ(Or, C);
14759     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
14760   }
14761   case X86::BI__builtin_ia32_kortestzqi:
14762   case X86::BI__builtin_ia32_kortestzhi:
14763   case X86::BI__builtin_ia32_kortestzsi:
14764   case X86::BI__builtin_ia32_kortestzdi: {
14765     Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
14766     Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
14767     Value *Cmp = Builder.CreateICmpEQ(Or, C);
14768     return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
14769   }
14770 
14771   case X86::BI__builtin_ia32_ktestcqi:
14772   case X86::BI__builtin_ia32_ktestzqi:
14773   case X86::BI__builtin_ia32_ktestchi:
14774   case X86::BI__builtin_ia32_ktestzhi:
14775   case X86::BI__builtin_ia32_ktestcsi:
14776   case X86::BI__builtin_ia32_ktestzsi:
14777   case X86::BI__builtin_ia32_ktestcdi:
14778   case X86::BI__builtin_ia32_ktestzdi: {
14779     Intrinsic::ID IID;
14780     switch (BuiltinID) {
14781     default: llvm_unreachable("Unsupported intrinsic!");
14782     case X86::BI__builtin_ia32_ktestcqi:
14783       IID = Intrinsic::x86_avx512_ktestc_b;
14784       break;
14785     case X86::BI__builtin_ia32_ktestzqi:
14786       IID = Intrinsic::x86_avx512_ktestz_b;
14787       break;
14788     case X86::BI__builtin_ia32_ktestchi:
14789       IID = Intrinsic::x86_avx512_ktestc_w;
14790       break;
14791     case X86::BI__builtin_ia32_ktestzhi:
14792       IID = Intrinsic::x86_avx512_ktestz_w;
14793       break;
14794     case X86::BI__builtin_ia32_ktestcsi:
14795       IID = Intrinsic::x86_avx512_ktestc_d;
14796       break;
14797     case X86::BI__builtin_ia32_ktestzsi:
14798       IID = Intrinsic::x86_avx512_ktestz_d;
14799       break;
14800     case X86::BI__builtin_ia32_ktestcdi:
14801       IID = Intrinsic::x86_avx512_ktestc_q;
14802       break;
14803     case X86::BI__builtin_ia32_ktestzdi:
14804       IID = Intrinsic::x86_avx512_ktestz_q;
14805       break;
14806     }
14807 
14808     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14809     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14810     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14811     Function *Intr = CGM.getIntrinsic(IID);
14812     return Builder.CreateCall(Intr, {LHS, RHS});
14813   }
14814 
14815   case X86::BI__builtin_ia32_kaddqi:
14816   case X86::BI__builtin_ia32_kaddhi:
14817   case X86::BI__builtin_ia32_kaddsi:
14818   case X86::BI__builtin_ia32_kadddi: {
14819     Intrinsic::ID IID;
14820     switch (BuiltinID) {
14821     default: llvm_unreachable("Unsupported intrinsic!");
14822     case X86::BI__builtin_ia32_kaddqi:
14823       IID = Intrinsic::x86_avx512_kadd_b;
14824       break;
14825     case X86::BI__builtin_ia32_kaddhi:
14826       IID = Intrinsic::x86_avx512_kadd_w;
14827       break;
14828     case X86::BI__builtin_ia32_kaddsi:
14829       IID = Intrinsic::x86_avx512_kadd_d;
14830       break;
14831     case X86::BI__builtin_ia32_kadddi:
14832       IID = Intrinsic::x86_avx512_kadd_q;
14833       break;
14834     }
14835 
14836     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14837     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14838     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14839     Function *Intr = CGM.getIntrinsic(IID);
14840     Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
14841     return Builder.CreateBitCast(Res, Ops[0]->getType());
14842   }
14843   case X86::BI__builtin_ia32_kandqi:
14844   case X86::BI__builtin_ia32_kandhi:
14845   case X86::BI__builtin_ia32_kandsi:
14846   case X86::BI__builtin_ia32_kanddi:
14847     return EmitX86MaskLogic(*this, Instruction::And, Ops);
14848   case X86::BI__builtin_ia32_kandnqi:
14849   case X86::BI__builtin_ia32_kandnhi:
14850   case X86::BI__builtin_ia32_kandnsi:
14851   case X86::BI__builtin_ia32_kandndi:
14852     return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
14853   case X86::BI__builtin_ia32_korqi:
14854   case X86::BI__builtin_ia32_korhi:
14855   case X86::BI__builtin_ia32_korsi:
14856   case X86::BI__builtin_ia32_kordi:
14857     return EmitX86MaskLogic(*this, Instruction::Or, Ops);
14858   case X86::BI__builtin_ia32_kxnorqi:
14859   case X86::BI__builtin_ia32_kxnorhi:
14860   case X86::BI__builtin_ia32_kxnorsi:
14861   case X86::BI__builtin_ia32_kxnordi:
14862     return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
14863   case X86::BI__builtin_ia32_kxorqi:
14864   case X86::BI__builtin_ia32_kxorhi:
14865   case X86::BI__builtin_ia32_kxorsi:
14866   case X86::BI__builtin_ia32_kxordi:
14867     return EmitX86MaskLogic(*this, Instruction::Xor,  Ops);
14868   case X86::BI__builtin_ia32_knotqi:
14869   case X86::BI__builtin_ia32_knothi:
14870   case X86::BI__builtin_ia32_knotsi:
14871   case X86::BI__builtin_ia32_knotdi: {
14872     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14873     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
14874     return Builder.CreateBitCast(Builder.CreateNot(Res),
14875                                  Ops[0]->getType());
14876   }
14877   case X86::BI__builtin_ia32_kmovb:
14878   case X86::BI__builtin_ia32_kmovw:
14879   case X86::BI__builtin_ia32_kmovd:
14880   case X86::BI__builtin_ia32_kmovq: {
14881     // Bitcast to vXi1 type and then back to integer. This gets the mask
14882     // register type into the IR, but might be optimized out depending on
14883     // what's around it.
14884     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14885     Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
14886     return Builder.CreateBitCast(Res, Ops[0]->getType());
14887   }
14888 
14889   case X86::BI__builtin_ia32_kunpckdi:
14890   case X86::BI__builtin_ia32_kunpcksi:
14891   case X86::BI__builtin_ia32_kunpckhi: {
14892     unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
14893     Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
14894     Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
14895     int Indices[64];
14896     for (unsigned i = 0; i != NumElts; ++i)
14897       Indices[i] = i;
14898 
14899     // First extract half of each vector. This gives better codegen than
14900     // doing it in a single shuffle.
14901     LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
14902     RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
14903     // Concat the vectors.
14904     // NOTE: Operands are swapped to match the intrinsic definition.
14905     Value *Res =
14906         Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
14907     return Builder.CreateBitCast(Res, Ops[0]->getType());
14908   }
14909 
14910   case X86::BI__builtin_ia32_vplzcntd_128:
14911   case X86::BI__builtin_ia32_vplzcntd_256:
14912   case X86::BI__builtin_ia32_vplzcntd_512:
14913   case X86::BI__builtin_ia32_vplzcntq_128:
14914   case X86::BI__builtin_ia32_vplzcntq_256:
14915   case X86::BI__builtin_ia32_vplzcntq_512: {
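    // vplzcnt maps directly to the generic ctlz intrinsic; the second argument
    // (false) means a zero input yields the bit width rather than poison.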
14916     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
14917     return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)});
14918   }
14919   case X86::BI__builtin_ia32_sqrtss:
14920   case X86::BI__builtin_ia32_sqrtsd: {
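    // Scalar sqrt: operate on element 0 only, using the constrained sqrt
    // intrinsic when strict FP semantics are in effect.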
14921     Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
14922     Function *F;
14923     if (Builder.getIsFPConstrained()) {
14924       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14925       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
14926                            A->getType());
14927       A = Builder.CreateConstrainedFPCall(F, {A});
14928     } else {
14929       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
14930       A = Builder.CreateCall(F, {A});
14931     }
14932     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
14933   }
14934   case X86::BI__builtin_ia32_sqrtsh_round_mask:
14935   case X86::BI__builtin_ia32_sqrtsd_round_mask:
14936   case X86::BI__builtin_ia32_sqrtss_round_mask: {
14937     unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
    // Only lower to a generic sqrt when the rounding mode is 4 (CUR_DIRECTION);
    // otherwise keep the target-specific intrinsic.
14940     if (CC != 4) {
14941       Intrinsic::ID IID;
14942 
14943       switch (BuiltinID) {
14944       default:
14945         llvm_unreachable("Unsupported intrinsic!");
14946       case X86::BI__builtin_ia32_sqrtsh_round_mask:
14947         IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
14948         break;
14949       case X86::BI__builtin_ia32_sqrtsd_round_mask:
14950         IID = Intrinsic::x86_avx512_mask_sqrt_sd;
14951         break;
14952       case X86::BI__builtin_ia32_sqrtss_round_mask:
14953         IID = Intrinsic::x86_avx512_mask_sqrt_ss;
14954         break;
14955       }
14956       return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
14957     }
14958     Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
14959     Function *F;
14960     if (Builder.getIsFPConstrained()) {
14961       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14962       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
14963                            A->getType());
14964       A = Builder.CreateConstrainedFPCall(F, A);
14965     } else {
14966       F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
14967       A = Builder.CreateCall(F, A);
14968     }
14969     Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
14970     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
14971     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
14972   }
14973   case X86::BI__builtin_ia32_sqrtpd256:
14974   case X86::BI__builtin_ia32_sqrtpd:
14975   case X86::BI__builtin_ia32_sqrtps256:
14976   case X86::BI__builtin_ia32_sqrtps:
14977   case X86::BI__builtin_ia32_sqrtph256:
14978   case X86::BI__builtin_ia32_sqrtph:
14979   case X86::BI__builtin_ia32_sqrtph512:
14980   case X86::BI__builtin_ia32_sqrtps512:
14981   case X86::BI__builtin_ia32_sqrtpd512: {
14982     if (Ops.size() == 2) {
14983       unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
      // Only lower to a generic sqrt when the rounding mode is 4
      // (CUR_DIRECTION); otherwise keep the target-specific intrinsic.
14986       if (CC != 4) {
14987         Intrinsic::ID IID;
14988 
14989         switch (BuiltinID) {
14990         default:
14991           llvm_unreachable("Unsupported intrinsic!");
14992         case X86::BI__builtin_ia32_sqrtph512:
14993           IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
14994           break;
14995         case X86::BI__builtin_ia32_sqrtps512:
14996           IID = Intrinsic::x86_avx512_sqrt_ps_512;
14997           break;
14998         case X86::BI__builtin_ia32_sqrtpd512:
14999           IID = Intrinsic::x86_avx512_sqrt_pd_512;
15000           break;
15001         }
15002         return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15003       }
15004     }
15005     if (Builder.getIsFPConstrained()) {
15006       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15007       Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15008                                      Ops[0]->getType());
15009       return Builder.CreateConstrainedFPCall(F, Ops[0]);
15010     } else {
15011       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
15012       return Builder.CreateCall(F, Ops[0]);
15013     }
15014   }
15015 
15016   case X86::BI__builtin_ia32_pmuludq128:
15017   case X86::BI__builtin_ia32_pmuludq256:
15018   case X86::BI__builtin_ia32_pmuludq512:
15019     return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
15020 
15021   case X86::BI__builtin_ia32_pmuldq128:
15022   case X86::BI__builtin_ia32_pmuldq256:
15023   case X86::BI__builtin_ia32_pmuldq512:
15024     return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
15025 
15026   case X86::BI__builtin_ia32_pternlogd512_mask:
15027   case X86::BI__builtin_ia32_pternlogq512_mask:
15028   case X86::BI__builtin_ia32_pternlogd128_mask:
15029   case X86::BI__builtin_ia32_pternlogd256_mask:
15030   case X86::BI__builtin_ia32_pternlogq128_mask:
15031   case X86::BI__builtin_ia32_pternlogq256_mask:
15032     return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
15033 
15034   case X86::BI__builtin_ia32_pternlogd512_maskz:
15035   case X86::BI__builtin_ia32_pternlogq512_maskz:
15036   case X86::BI__builtin_ia32_pternlogd128_maskz:
15037   case X86::BI__builtin_ia32_pternlogd256_maskz:
15038   case X86::BI__builtin_ia32_pternlogq128_maskz:
15039   case X86::BI__builtin_ia32_pternlogq256_maskz:
15040     return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
15041 
15042   case X86::BI__builtin_ia32_vpshldd128:
15043   case X86::BI__builtin_ia32_vpshldd256:
15044   case X86::BI__builtin_ia32_vpshldd512:
15045   case X86::BI__builtin_ia32_vpshldq128:
15046   case X86::BI__builtin_ia32_vpshldq256:
15047   case X86::BI__builtin_ia32_vpshldq512:
15048   case X86::BI__builtin_ia32_vpshldw128:
15049   case X86::BI__builtin_ia32_vpshldw256:
15050   case X86::BI__builtin_ia32_vpshldw512:
15051     return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15052 
15053   case X86::BI__builtin_ia32_vpshrdd128:
15054   case X86::BI__builtin_ia32_vpshrdd256:
15055   case X86::BI__builtin_ia32_vpshrdd512:
15056   case X86::BI__builtin_ia32_vpshrdq128:
15057   case X86::BI__builtin_ia32_vpshrdq256:
15058   case X86::BI__builtin_ia32_vpshrdq512:
15059   case X86::BI__builtin_ia32_vpshrdw128:
15060   case X86::BI__builtin_ia32_vpshrdw256:
15061   case X86::BI__builtin_ia32_vpshrdw512:
15062     // Ops 0 and 1 are swapped.
15063     return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15064 
15065   case X86::BI__builtin_ia32_vpshldvd128:
15066   case X86::BI__builtin_ia32_vpshldvd256:
15067   case X86::BI__builtin_ia32_vpshldvd512:
15068   case X86::BI__builtin_ia32_vpshldvq128:
15069   case X86::BI__builtin_ia32_vpshldvq256:
15070   case X86::BI__builtin_ia32_vpshldvq512:
15071   case X86::BI__builtin_ia32_vpshldvw128:
15072   case X86::BI__builtin_ia32_vpshldvw256:
15073   case X86::BI__builtin_ia32_vpshldvw512:
15074     return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15075 
15076   case X86::BI__builtin_ia32_vpshrdvd128:
15077   case X86::BI__builtin_ia32_vpshrdvd256:
15078   case X86::BI__builtin_ia32_vpshrdvd512:
15079   case X86::BI__builtin_ia32_vpshrdvq128:
15080   case X86::BI__builtin_ia32_vpshrdvq256:
15081   case X86::BI__builtin_ia32_vpshrdvq512:
15082   case X86::BI__builtin_ia32_vpshrdvw128:
15083   case X86::BI__builtin_ia32_vpshrdvw256:
15084   case X86::BI__builtin_ia32_vpshrdvw512:
15085     // Ops 0 and 1 are swapped.
15086     return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15087 
15088   // Reductions
15089   case X86::BI__builtin_ia32_reduce_fadd_pd512:
15090   case X86::BI__builtin_ia32_reduce_fadd_ps512:
15091   case X86::BI__builtin_ia32_reduce_fadd_ph512:
15092   case X86::BI__builtin_ia32_reduce_fadd_ph256:
15093   case X86::BI__builtin_ia32_reduce_fadd_ph128: {
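    // Lower to llvm.vector.reduce.fadd with Ops[0] as the start value; the
    // reassoc flag lets the backend use a faster, reassociated reduction.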
15094     Function *F =
15095         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
15096     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15097     Builder.getFastMathFlags().setAllowReassoc();
15098     return Builder.CreateCall(F, {Ops[0], Ops[1]});
15099   }
15100   case X86::BI__builtin_ia32_reduce_fmul_pd512:
15101   case X86::BI__builtin_ia32_reduce_fmul_ps512:
15102   case X86::BI__builtin_ia32_reduce_fmul_ph512:
15103   case X86::BI__builtin_ia32_reduce_fmul_ph256:
15104   case X86::BI__builtin_ia32_reduce_fmul_ph128: {
15105     Function *F =
15106         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
15107     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15108     Builder.getFastMathFlags().setAllowReassoc();
15109     return Builder.CreateCall(F, {Ops[0], Ops[1]});
15110   }
15111   case X86::BI__builtin_ia32_reduce_fmax_pd512:
15112   case X86::BI__builtin_ia32_reduce_fmax_ps512:
15113   case X86::BI__builtin_ia32_reduce_fmax_ph512:
15114   case X86::BI__builtin_ia32_reduce_fmax_ph256:
15115   case X86::BI__builtin_ia32_reduce_fmax_ph128: {
15116     Function *F =
15117         CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
15118     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15119     Builder.getFastMathFlags().setNoNaNs();
15120     return Builder.CreateCall(F, {Ops[0]});
15121   }
15122   case X86::BI__builtin_ia32_reduce_fmin_pd512:
15123   case X86::BI__builtin_ia32_reduce_fmin_ps512:
15124   case X86::BI__builtin_ia32_reduce_fmin_ph512:
15125   case X86::BI__builtin_ia32_reduce_fmin_ph256:
15126   case X86::BI__builtin_ia32_reduce_fmin_ph128: {
15127     Function *F =
15128         CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
15129     IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15130     Builder.getFastMathFlags().setNoNaNs();
15131     return Builder.CreateCall(F, {Ops[0]});
15132   }
15133 
15134   // 3DNow!
15135   case X86::BI__builtin_ia32_pswapdsf:
15136   case X86::BI__builtin_ia32_pswapdsi: {
15137     llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
15138     Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
15139     llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
15140     return Builder.CreateCall(F, Ops, "pswapd");
15141   }
15142   case X86::BI__builtin_ia32_rdrand16_step:
15143   case X86::BI__builtin_ia32_rdrand32_step:
15144   case X86::BI__builtin_ia32_rdrand64_step:
15145   case X86::BI__builtin_ia32_rdseed16_step:
15146   case X86::BI__builtin_ia32_rdseed32_step:
15147   case X86::BI__builtin_ia32_rdseed64_step: {
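    // These intrinsics return a {random value, success flag} pair: store the
    // value through the pointer operand and return the flag.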
15148     Intrinsic::ID ID;
15149     switch (BuiltinID) {
15150     default: llvm_unreachable("Unsupported intrinsic!");
15151     case X86::BI__builtin_ia32_rdrand16_step:
15152       ID = Intrinsic::x86_rdrand_16;
15153       break;
15154     case X86::BI__builtin_ia32_rdrand32_step:
15155       ID = Intrinsic::x86_rdrand_32;
15156       break;
15157     case X86::BI__builtin_ia32_rdrand64_step:
15158       ID = Intrinsic::x86_rdrand_64;
15159       break;
15160     case X86::BI__builtin_ia32_rdseed16_step:
15161       ID = Intrinsic::x86_rdseed_16;
15162       break;
15163     case X86::BI__builtin_ia32_rdseed32_step:
15164       ID = Intrinsic::x86_rdseed_32;
15165       break;
15166     case X86::BI__builtin_ia32_rdseed64_step:
15167       ID = Intrinsic::x86_rdseed_64;
15168       break;
15169     }
15170 
15171     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
15172     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
15173                                       Ops[0]);
15174     return Builder.CreateExtractValue(Call, 1);
15175   }
15176   case X86::BI__builtin_ia32_addcarryx_u32:
15177   case X86::BI__builtin_ia32_addcarryx_u64:
15178   case X86::BI__builtin_ia32_subborrow_u32:
15179   case X86::BI__builtin_ia32_subborrow_u64: {
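    // The intrinsic returns {carry-out, result}: store the arithmetic result
    // through the out-pointer (Ops[3]) and return the carry/borrow flag.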
15180     Intrinsic::ID IID;
15181     switch (BuiltinID) {
15182     default: llvm_unreachable("Unsupported intrinsic!");
15183     case X86::BI__builtin_ia32_addcarryx_u32:
15184       IID = Intrinsic::x86_addcarry_32;
15185       break;
15186     case X86::BI__builtin_ia32_addcarryx_u64:
15187       IID = Intrinsic::x86_addcarry_64;
15188       break;
15189     case X86::BI__builtin_ia32_subborrow_u32:
15190       IID = Intrinsic::x86_subborrow_32;
15191       break;
15192     case X86::BI__builtin_ia32_subborrow_u64:
15193       IID = Intrinsic::x86_subborrow_64;
15194       break;
15195     }
15196 
15197     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
15198                                      { Ops[0], Ops[1], Ops[2] });
15199     Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
15200                                       Ops[3]);
15201     return Builder.CreateExtractValue(Call, 0);
15202   }
15203 
15204   case X86::BI__builtin_ia32_fpclassps128_mask:
15205   case X86::BI__builtin_ia32_fpclassps256_mask:
15206   case X86::BI__builtin_ia32_fpclassps512_mask:
15207   case X86::BI__builtin_ia32_fpclassph128_mask:
15208   case X86::BI__builtin_ia32_fpclassph256_mask:
15209   case X86::BI__builtin_ia32_fpclassph512_mask:
15210   case X86::BI__builtin_ia32_fpclasspd128_mask:
15211   case X86::BI__builtin_ia32_fpclasspd256_mask:
15212   case X86::BI__builtin_ia32_fpclasspd512_mask: {
15213     unsigned NumElts =
15214         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15215     Value *MaskIn = Ops[2];
15216     Ops.erase(&Ops[2]);
15217 
15218     Intrinsic::ID ID;
15219     switch (BuiltinID) {
15220     default: llvm_unreachable("Unsupported intrinsic!");
15221     case X86::BI__builtin_ia32_fpclassph128_mask:
15222       ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
15223       break;
15224     case X86::BI__builtin_ia32_fpclassph256_mask:
15225       ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
15226       break;
15227     case X86::BI__builtin_ia32_fpclassph512_mask:
15228       ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
15229       break;
15230     case X86::BI__builtin_ia32_fpclassps128_mask:
15231       ID = Intrinsic::x86_avx512_fpclass_ps_128;
15232       break;
15233     case X86::BI__builtin_ia32_fpclassps256_mask:
15234       ID = Intrinsic::x86_avx512_fpclass_ps_256;
15235       break;
15236     case X86::BI__builtin_ia32_fpclassps512_mask:
15237       ID = Intrinsic::x86_avx512_fpclass_ps_512;
15238       break;
15239     case X86::BI__builtin_ia32_fpclasspd128_mask:
15240       ID = Intrinsic::x86_avx512_fpclass_pd_128;
15241       break;
15242     case X86::BI__builtin_ia32_fpclasspd256_mask:
15243       ID = Intrinsic::x86_avx512_fpclass_pd_256;
15244       break;
15245     case X86::BI__builtin_ia32_fpclasspd512_mask:
15246       ID = Intrinsic::x86_avx512_fpclass_pd_512;
15247       break;
15248     }
15249 
15250     Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15251     return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
15252   }
15253 
15254   case X86::BI__builtin_ia32_vp2intersect_q_512:
15255   case X86::BI__builtin_ia32_vp2intersect_q_256:
15256   case X86::BI__builtin_ia32_vp2intersect_q_128:
15257   case X86::BI__builtin_ia32_vp2intersect_d_512:
15258   case X86::BI__builtin_ia32_vp2intersect_d_256:
15259   case X86::BI__builtin_ia32_vp2intersect_d_128: {
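    // vp2intersect produces two mask results; convert each vXi1 result to the
    // integer mask type and store it through the two output pointers
    // (Ops[2] and Ops[3]).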
15260     unsigned NumElts =
15261         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15262     Intrinsic::ID ID;
15263 
15264     switch (BuiltinID) {
15265     default: llvm_unreachable("Unsupported intrinsic!");
15266     case X86::BI__builtin_ia32_vp2intersect_q_512:
15267       ID = Intrinsic::x86_avx512_vp2intersect_q_512;
15268       break;
15269     case X86::BI__builtin_ia32_vp2intersect_q_256:
15270       ID = Intrinsic::x86_avx512_vp2intersect_q_256;
15271       break;
15272     case X86::BI__builtin_ia32_vp2intersect_q_128:
15273       ID = Intrinsic::x86_avx512_vp2intersect_q_128;
15274       break;
15275     case X86::BI__builtin_ia32_vp2intersect_d_512:
15276       ID = Intrinsic::x86_avx512_vp2intersect_d_512;
15277       break;
15278     case X86::BI__builtin_ia32_vp2intersect_d_256:
15279       ID = Intrinsic::x86_avx512_vp2intersect_d_256;
15280       break;
15281     case X86::BI__builtin_ia32_vp2intersect_d_128:
15282       ID = Intrinsic::x86_avx512_vp2intersect_d_128;
15283       break;
15284     }
15285 
15286     Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
15287     Value *Result = Builder.CreateExtractValue(Call, 0);
15288     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15289     Builder.CreateDefaultAlignedStore(Result, Ops[2]);
15290 
15291     Result = Builder.CreateExtractValue(Call, 1);
15292     Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15293     return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
15294   }
15295 
15296   case X86::BI__builtin_ia32_vpmultishiftqb128:
15297   case X86::BI__builtin_ia32_vpmultishiftqb256:
15298   case X86::BI__builtin_ia32_vpmultishiftqb512: {
15299     Intrinsic::ID ID;
15300     switch (BuiltinID) {
15301     default: llvm_unreachable("Unsupported intrinsic!");
15302     case X86::BI__builtin_ia32_vpmultishiftqb128:
15303       ID = Intrinsic::x86_avx512_pmultishift_qb_128;
15304       break;
15305     case X86::BI__builtin_ia32_vpmultishiftqb256:
15306       ID = Intrinsic::x86_avx512_pmultishift_qb_256;
15307       break;
15308     case X86::BI__builtin_ia32_vpmultishiftqb512:
15309       ID = Intrinsic::x86_avx512_pmultishift_qb_512;
15310       break;
15311     }
15312 
15313     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15314   }
15315 
15316   case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15317   case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15318   case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
15319     unsigned NumElts =
15320         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15321     Value *MaskIn = Ops[2];
15322     Ops.erase(&Ops[2]);
15323 
15324     Intrinsic::ID ID;
15325     switch (BuiltinID) {
15326     default: llvm_unreachable("Unsupported intrinsic!");
15327     case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15328       ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
15329       break;
15330     case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15331       ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
15332       break;
15333     case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
15334       ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
15335       break;
15336     }
15337 
15338     Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15339     return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
15340   }
15341 
  // Packed comparison intrinsics.
15343   case X86::BI__builtin_ia32_cmpeqps:
15344   case X86::BI__builtin_ia32_cmpeqpd:
15345     return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
15346   case X86::BI__builtin_ia32_cmpltps:
15347   case X86::BI__builtin_ia32_cmpltpd:
15348     return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
15349   case X86::BI__builtin_ia32_cmpleps:
15350   case X86::BI__builtin_ia32_cmplepd:
15351     return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
15352   case X86::BI__builtin_ia32_cmpunordps:
15353   case X86::BI__builtin_ia32_cmpunordpd:
15354     return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
15355   case X86::BI__builtin_ia32_cmpneqps:
15356   case X86::BI__builtin_ia32_cmpneqpd:
15357     return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
15358   case X86::BI__builtin_ia32_cmpnltps:
15359   case X86::BI__builtin_ia32_cmpnltpd:
15360     return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
15361   case X86::BI__builtin_ia32_cmpnleps:
15362   case X86::BI__builtin_ia32_cmpnlepd:
15363     return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
15364   case X86::BI__builtin_ia32_cmpordps:
15365   case X86::BI__builtin_ia32_cmpordpd:
15366     return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
15367   case X86::BI__builtin_ia32_cmpph128_mask:
15368   case X86::BI__builtin_ia32_cmpph256_mask:
15369   case X86::BI__builtin_ia32_cmpph512_mask:
15370   case X86::BI__builtin_ia32_cmpps128_mask:
15371   case X86::BI__builtin_ia32_cmpps256_mask:
15372   case X86::BI__builtin_ia32_cmpps512_mask:
15373   case X86::BI__builtin_ia32_cmppd128_mask:
15374   case X86::BI__builtin_ia32_cmppd256_mask:
15375   case X86::BI__builtin_ia32_cmppd512_mask:
15376     IsMaskFCmp = true;
15377     [[fallthrough]];
15378   case X86::BI__builtin_ia32_cmpps:
15379   case X86::BI__builtin_ia32_cmpps256:
15380   case X86::BI__builtin_ia32_cmppd:
15381   case X86::BI__builtin_ia32_cmppd256: {
    // Lowering vector comparisons to fcmp instructions, while
    // ignoring the requested signalling behaviour and rounding mode.
    // This is only possible if the FP model is not strict and FENV_ACCESS is
    // off.
15386 
    // The third argument is the comparison condition, an integer in the
    // range [0, 31].
15389     unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
15390 
15391     // Lowering to IR fcmp instruction.
15392     // Ignoring requested signaling behaviour,
15393     // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT.
15394     FCmpInst::Predicate Pred;
15395     bool IsSignaling;
15396     // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
15397     // behavior is inverted. We'll handle that after the switch.
15398     switch (CC & 0xf) {
15399     case 0x00: Pred = FCmpInst::FCMP_OEQ;   IsSignaling = false; break;
15400     case 0x01: Pred = FCmpInst::FCMP_OLT;   IsSignaling = true;  break;
15401     case 0x02: Pred = FCmpInst::FCMP_OLE;   IsSignaling = true;  break;
15402     case 0x03: Pred = FCmpInst::FCMP_UNO;   IsSignaling = false; break;
15403     case 0x04: Pred = FCmpInst::FCMP_UNE;   IsSignaling = false; break;
15404     case 0x05: Pred = FCmpInst::FCMP_UGE;   IsSignaling = true;  break;
15405     case 0x06: Pred = FCmpInst::FCMP_UGT;   IsSignaling = true;  break;
15406     case 0x07: Pred = FCmpInst::FCMP_ORD;   IsSignaling = false; break;
15407     case 0x08: Pred = FCmpInst::FCMP_UEQ;   IsSignaling = false; break;
15408     case 0x09: Pred = FCmpInst::FCMP_ULT;   IsSignaling = true;  break;
15409     case 0x0a: Pred = FCmpInst::FCMP_ULE;   IsSignaling = true;  break;
15410     case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
15411     case 0x0c: Pred = FCmpInst::FCMP_ONE;   IsSignaling = false; break;
15412     case 0x0d: Pred = FCmpInst::FCMP_OGE;   IsSignaling = true;  break;
15413     case 0x0e: Pred = FCmpInst::FCMP_OGT;   IsSignaling = true;  break;
15414     case 0x0f: Pred = FCmpInst::FCMP_TRUE;  IsSignaling = false; break;
15415     default: llvm_unreachable("Unhandled CC");
15416     }
15417 
15418     // Invert the signalling behavior for 16-31.
15419     if (CC & 0x10)
15420       IsSignaling = !IsSignaling;
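    // For example (illustrative): _CMP_LT_OS (0x01) selects a signalling
    // FCMP_OLT, while _CMP_LT_OQ (0x11) selects the same predicate with the
    // signalling behavior inverted, i.e. a quiet FCMP_OLT.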
15421 
    // If we're using constrained intrinsics and the predicate is true or
    // false, there is no constrained compare intrinsic we can use, so fall
    // back to the legacy X86-specific intrinsic. Likewise, if the intrinsic
    // is mask enabled and we're using constrained intrinsics, use the legacy
    // X86-specific intrinsic.
15427     if (Builder.getIsFPConstrained() &&
15428         (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
15429          IsMaskFCmp)) {
15430 
15431       Intrinsic::ID IID;
15432       switch (BuiltinID) {
15433       default: llvm_unreachable("Unexpected builtin");
15434       case X86::BI__builtin_ia32_cmpps:
15435         IID = Intrinsic::x86_sse_cmp_ps;
15436         break;
15437       case X86::BI__builtin_ia32_cmpps256:
15438         IID = Intrinsic::x86_avx_cmp_ps_256;
15439         break;
15440       case X86::BI__builtin_ia32_cmppd:
15441         IID = Intrinsic::x86_sse2_cmp_pd;
15442         break;
15443       case X86::BI__builtin_ia32_cmppd256:
15444         IID = Intrinsic::x86_avx_cmp_pd_256;
15445         break;
15446       case X86::BI__builtin_ia32_cmpps512_mask:
15447         IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
15448         break;
15449       case X86::BI__builtin_ia32_cmppd512_mask:
15450         IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
15451         break;
15452       case X86::BI__builtin_ia32_cmpps128_mask:
15453         IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
15454         break;
15455       case X86::BI__builtin_ia32_cmpps256_mask:
15456         IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
15457         break;
15458       case X86::BI__builtin_ia32_cmppd128_mask:
15459         IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
15460         break;
15461       case X86::BI__builtin_ia32_cmppd256_mask:
15462         IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
15463         break;
15464       }
15465 
15466       Function *Intr = CGM.getIntrinsic(IID);
15467       if (IsMaskFCmp) {
15468         unsigned NumElts =
15469             cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15470         Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
15471         Value *Cmp = Builder.CreateCall(Intr, Ops);
15472         return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
15473       }
15474 
15475       return Builder.CreateCall(Intr, Ops);
15476     }
15477 
15478     // Builtins without the _mask suffix return a vector of integers
15479     // of the same width as the input vectors
15480     if (IsMaskFCmp) {
      // We ignore SAE if strict FP is disabled; precise exception behavior is
      // only kept under strict FP.
      // NOTE: If strict FP ever goes through here, a CGFPOptionsRAII object
      // will be required.
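      // Rough sketch (assuming the 128-bit variant with 4 float elements):
      // the fcmp below yields a <4 x i1>, which EmitX86MaskedCompareResult
      // ands with the incoming mask in Ops[3] and widens to the i8 __mmask8
      // return value.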
15485       unsigned NumElts =
15486           cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15487       Value *Cmp;
15488       if (IsSignaling)
15489         Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
15490       else
15491         Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
15492       return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
15493     }
15494 
15495     return getVectorFCmpIR(Pred, IsSignaling);
15496   }
15497 
15498   // SSE scalar comparison intrinsics
15499   case X86::BI__builtin_ia32_cmpeqss:
15500     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
15501   case X86::BI__builtin_ia32_cmpltss:
15502     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
15503   case X86::BI__builtin_ia32_cmpless:
15504     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
15505   case X86::BI__builtin_ia32_cmpunordss:
15506     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
15507   case X86::BI__builtin_ia32_cmpneqss:
15508     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
15509   case X86::BI__builtin_ia32_cmpnltss:
15510     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
15511   case X86::BI__builtin_ia32_cmpnless:
15512     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
15513   case X86::BI__builtin_ia32_cmpordss:
15514     return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
15515   case X86::BI__builtin_ia32_cmpeqsd:
15516     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
15517   case X86::BI__builtin_ia32_cmpltsd:
15518     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
15519   case X86::BI__builtin_ia32_cmplesd:
15520     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
15521   case X86::BI__builtin_ia32_cmpunordsd:
15522     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
15523   case X86::BI__builtin_ia32_cmpneqsd:
15524     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
15525   case X86::BI__builtin_ia32_cmpnltsd:
15526     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
15527   case X86::BI__builtin_ia32_cmpnlesd:
15528     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
15529   case X86::BI__builtin_ia32_cmpordsd:
15530     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
15531 
15532   // f16c half2float intrinsics
15533   case X86::BI__builtin_ia32_vcvtph2ps:
15534   case X86::BI__builtin_ia32_vcvtph2ps256:
15535   case X86::BI__builtin_ia32_vcvtph2ps_mask:
15536   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
15537   case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
15538     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15539     return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
15540   }
15541 
15542   // AVX512 bf16 intrinsics
15543   case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
15544     Ops[2] = getMaskVecValue(
15545         *this, Ops[2],
15546         cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
15547     Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
15548     return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15549   }
15550   case X86::BI__builtin_ia32_cvtsbf162ss_32:
15551     return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
15552 
15553   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
15554   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
15555     Intrinsic::ID IID;
15556     switch (BuiltinID) {
15557     default: llvm_unreachable("Unsupported intrinsic!");
15558     case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
15559       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
15560       break;
15561     case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
15562       IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
15563       break;
15564     }
15565     Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
15566     return EmitX86Select(*this, Ops[2], Res, Ops[1]);
15567   }
15568 
15569   case X86::BI__cpuid:
15570   case X86::BI__cpuidex: {
15571     Value *FuncId = EmitScalarExpr(E->getArg(1));
15572     Value *SubFuncId = BuiltinID == X86::BI__cpuidex
15573                            ? EmitScalarExpr(E->getArg(2))
15574                            : llvm::ConstantInt::get(Int32Ty, 0);
15575 
15576     llvm::StructType *CpuidRetTy =
15577         llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty);
15578     llvm::FunctionType *FTy =
15579         llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false);
15580 
15581     StringRef Asm, Constraints;
15582     if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
15583       Asm = "cpuid";
15584       Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
15585     } else {
15586       // x86-64 uses %rbx as the base register, so preserve it.
15587       Asm = "xchgq %rbx, ${1:q}\n"
15588             "cpuid\n"
15589             "xchgq %rbx, ${1:q}";
15590       Constraints = "={ax},=r,={cx},={dx},0,2";
15591     }
15592 
15593     llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints,
15594                                                /*hasSideEffects=*/false);
15595     Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId});
15596     Value *BasePtr = EmitScalarExpr(E->getArg(0));
15597     Value *Store = nullptr;
15598     for (unsigned i = 0; i < 4; i++) {
15599       Value *Extracted = Builder.CreateExtractValue(IACall, i);
15600       Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i);
15601       Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
15602     }
15603 
    // Return the last store instruction to signal that we have emitted the
    // intrinsic.
15606     return Store;
15607   }
15608 
15609   case X86::BI__emul:
15610   case X86::BI__emulu: {
15611     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
15612     bool isSigned = (BuiltinID == X86::BI__emul);
15613     Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
15614     Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
15615     return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
15616   }
15617   case X86::BI__mulh:
15618   case X86::BI__umulh:
15619   case X86::BI_mul128:
15620   case X86::BI_umul128: {
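    // These perform a full 64 x 64 -> 128-bit multiply: __mulh/__umulh return
    // only the high 64 bits, while _mul128/_umul128 also store the high half
    // through the third argument and return the low half.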
15621     llvm::Type *ResType = ConvertType(E->getType());
15622     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
15623 
15624     bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
15625     Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
15626     Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
15627 
15628     Value *MulResult, *HigherBits;
15629     if (IsSigned) {
15630       MulResult = Builder.CreateNSWMul(LHS, RHS);
15631       HigherBits = Builder.CreateAShr(MulResult, 64);
15632     } else {
15633       MulResult = Builder.CreateNUWMul(LHS, RHS);
15634       HigherBits = Builder.CreateLShr(MulResult, 64);
15635     }
15636     HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
15637 
15638     if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
15639       return HigherBits;
15640 
15641     Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
15642     Builder.CreateStore(HigherBits, HighBitsAddress);
15643     return Builder.CreateIntCast(MulResult, ResType, IsSigned);
15644   }
15645 
15646   case X86::BI__faststorefence: {
15647     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
15648                                llvm::SyncScope::System);
15649   }
15650   case X86::BI__shiftleft128:
15651   case X86::BI__shiftright128: {
15652     llvm::Function *F = CGM.getIntrinsic(
15653         BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
15654         Int64Ty);
15655     // Flip low/high ops and zero-extend amount to matching type.
15656     // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
15657     // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
15658     std::swap(Ops[0], Ops[1]);
15659     Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
15660     return Builder.CreateCall(F, Ops);
15661   }
15662   case X86::BI_ReadWriteBarrier:
15663   case X86::BI_ReadBarrier:
15664   case X86::BI_WriteBarrier: {
15665     return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
15666                                llvm::SyncScope::SingleThread);
15667   }
15668 
15669   case X86::BI_AddressOfReturnAddress: {
15670     Function *F =
15671         CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
15672     return Builder.CreateCall(F);
15673   }
15674   case X86::BI__stosb: {
    // We treat __stosb as a volatile memset - it may not generate a "rep
    // stosb" instruction, but it will create a memset that won't be optimized
    // away.
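    // For instance (illustrative IR on a 64-bit target):
    //   call void @llvm.memset.p0.i64(ptr align 1 %dst, i8 %val, i64 %n,
    //                                 i1 true)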
15677     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
15678   }
15679   case X86::BI__ud2:
15680     // llvm.trap makes a ud2a instruction on x86.
15681     return EmitTrapCall(Intrinsic::trap);
15682   case X86::BI__int2c: {
15683     // This syscall signals a driver assertion failure in x86 NT kernels.
15684     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
15685     llvm::InlineAsm *IA =
15686         llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
15687     llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
15688         getLLVMContext(), llvm::AttributeList::FunctionIndex,
15689         llvm::Attribute::NoReturn);
15690     llvm::CallInst *CI = Builder.CreateCall(IA);
15691     CI->setAttributes(NoReturnAttr);
15692     return CI;
15693   }
15694   case X86::BI__readfsbyte:
15695   case X86::BI__readfsword:
15696   case X86::BI__readfsdword:
15697   case X86::BI__readfsqword: {
15698     llvm::Type *IntTy = ConvertType(E->getType());
15699     Value *Ptr =
15700         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 257));
15701     LoadInst *Load = Builder.CreateAlignedLoad(
15702         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
15703     Load->setVolatile(true);
15704     return Load;
15705   }
15706   case X86::BI__readgsbyte:
15707   case X86::BI__readgsword:
15708   case X86::BI__readgsdword:
15709   case X86::BI__readgsqword: {
15710     llvm::Type *IntTy = ConvertType(E->getType());
15711     Value *Ptr =
15712         Builder.CreateIntToPtr(Ops[0], llvm::PointerType::get(IntTy, 256));
15713     LoadInst *Load = Builder.CreateAlignedLoad(
15714         IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
15715     Load->setVolatile(true);
15716     return Load;
15717   }
15718   case X86::BI__builtin_ia32_encodekey128_u32: {
15719     Intrinsic::ID IID = Intrinsic::x86_encodekey128;
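    // The intrinsic returns the i32 result plus several 128-bit pieces of the
    // key handle; the first three pieces (48 bytes) are stored to the output
    // buffer below and the i32 result is returned.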
15720 
15721     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
15722 
15723     for (int i = 0; i < 3; ++i) {
15724       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15725       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
15726       Ptr = Builder.CreateBitCast(
15727           Ptr, llvm::PointerType::getUnqual(Extract->getType()));
15728       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
15729     }
15730 
15731     return Builder.CreateExtractValue(Call, 0);
15732   }
15733   case X86::BI__builtin_ia32_encodekey256_u32: {
15734     Intrinsic::ID IID = Intrinsic::x86_encodekey256;
15735 
15736     Value *Call =
15737         Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
15738 
15739     for (int i = 0; i < 4; ++i) {
15740       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15741       Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
15742       Ptr = Builder.CreateBitCast(
15743           Ptr, llvm::PointerType::getUnqual(Extract->getType()));
15744       Builder.CreateAlignedStore(Extract, Ptr, Align(1));
15745     }
15746 
15747     return Builder.CreateExtractValue(Call, 0);
15748   }
15749   case X86::BI__builtin_ia32_aesenc128kl_u8:
15750   case X86::BI__builtin_ia32_aesdec128kl_u8:
15751   case X86::BI__builtin_ia32_aesenc256kl_u8:
15752   case X86::BI__builtin_ia32_aesdec256kl_u8: {
15753     Intrinsic::ID IID;
15754     StringRef BlockName;
15755     switch (BuiltinID) {
15756     default:
15757       llvm_unreachable("Unexpected builtin");
15758     case X86::BI__builtin_ia32_aesenc128kl_u8:
15759       IID = Intrinsic::x86_aesenc128kl;
15760       BlockName = "aesenc128kl";
15761       break;
15762     case X86::BI__builtin_ia32_aesdec128kl_u8:
15763       IID = Intrinsic::x86_aesdec128kl;
15764       BlockName = "aesdec128kl";
15765       break;
15766     case X86::BI__builtin_ia32_aesenc256kl_u8:
15767       IID = Intrinsic::x86_aesenc256kl;
15768       BlockName = "aesenc256kl";
15769       break;
15770     case X86::BI__builtin_ia32_aesdec256kl_u8:
15771       IID = Intrinsic::x86_aesdec256kl;
15772       BlockName = "aesdec256kl";
15773       break;
15774     }
15775 
15776     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
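    // The result is { i8 success-flag, <2 x i64> output-block }: branch on
    // the flag, store the output block on success (or zero on failure), and
    // return the flag.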
15777 
15778     BasicBlock *NoError =
15779         createBasicBlock(BlockName + "_no_error", this->CurFn);
15780     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
15781     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
15782 
15783     Value *Ret = Builder.CreateExtractValue(Call, 0);
15784     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
15785     Value *Out = Builder.CreateExtractValue(Call, 1);
15786     Builder.CreateCondBr(Succ, NoError, Error);
15787 
15788     Builder.SetInsertPoint(NoError);
15789     Builder.CreateDefaultAlignedStore(Out, Ops[0]);
15790     Builder.CreateBr(End);
15791 
15792     Builder.SetInsertPoint(Error);
15793     Constant *Zero = llvm::Constant::getNullValue(Out->getType());
15794     Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
15795     Builder.CreateBr(End);
15796 
15797     Builder.SetInsertPoint(End);
15798     return Builder.CreateExtractValue(Call, 0);
15799   }
15800   case X86::BI__builtin_ia32_aesencwide128kl_u8:
15801   case X86::BI__builtin_ia32_aesdecwide128kl_u8:
15802   case X86::BI__builtin_ia32_aesencwide256kl_u8:
15803   case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
15804     Intrinsic::ID IID;
15805     StringRef BlockName;
15806     switch (BuiltinID) {
15807     case X86::BI__builtin_ia32_aesencwide128kl_u8:
15808       IID = Intrinsic::x86_aesencwide128kl;
15809       BlockName = "aesencwide128kl";
15810       break;
15811     case X86::BI__builtin_ia32_aesdecwide128kl_u8:
15812       IID = Intrinsic::x86_aesdecwide128kl;
15813       BlockName = "aesdecwide128kl";
15814       break;
15815     case X86::BI__builtin_ia32_aesencwide256kl_u8:
15816       IID = Intrinsic::x86_aesencwide256kl;
15817       BlockName = "aesencwide256kl";
15818       break;
15819     case X86::BI__builtin_ia32_aesdecwide256kl_u8:
15820       IID = Intrinsic::x86_aesdecwide256kl;
15821       BlockName = "aesdecwide256kl";
15822       break;
15823     }
15824 
15825     llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
15826     Value *InOps[9];
15827     InOps[0] = Ops[2];
15828     for (int i = 0; i != 8; ++i) {
15829       Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
15830       InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
15831     }
15832 
15833     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
15834 
15835     BasicBlock *NoError =
15836         createBasicBlock(BlockName + "_no_error", this->CurFn);
15837     BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
15838     BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
15839 
15840     Value *Ret = Builder.CreateExtractValue(Call, 0);
15841     Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
15842     Builder.CreateCondBr(Succ, NoError, Error);
15843 
15844     Builder.SetInsertPoint(NoError);
15845     for (int i = 0; i != 8; ++i) {
15846       Value *Extract = Builder.CreateExtractValue(Call, i + 1);
15847       Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
15848       Builder.CreateAlignedStore(Extract, Ptr, Align(16));
15849     }
15850     Builder.CreateBr(End);
15851 
15852     Builder.SetInsertPoint(Error);
15853     for (int i = 0; i != 8; ++i) {
15854       Value *Out = Builder.CreateExtractValue(Call, i + 1);
15855       Constant *Zero = llvm::Constant::getNullValue(Out->getType());
15856       Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
15857       Builder.CreateAlignedStore(Zero, Ptr, Align(16));
15858     }
15859     Builder.CreateBr(End);
15860 
15861     Builder.SetInsertPoint(End);
15862     return Builder.CreateExtractValue(Call, 0);
15863   }
15864   case X86::BI__builtin_ia32_vfcmaddcph512_mask:
15865     IsConjFMA = true;
15866     [[fallthrough]];
15867   case X86::BI__builtin_ia32_vfmaddcph512_mask: {
15868     Intrinsic::ID IID = IsConjFMA
15869                             ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
15870                             : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
15871     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15872     return EmitX86Select(*this, Ops[3], Call, Ops[0]);
15873   }
15874   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
15875     IsConjFMA = true;
15876     [[fallthrough]];
15877   case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
15878     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
15879                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
15880     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15881     Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
15882     return EmitX86Select(*this, And, Call, Ops[0]);
15883   }
15884   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
15885     IsConjFMA = true;
15886     [[fallthrough]];
15887   case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
15888     Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
15889                                   : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
15890     Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15891     static constexpr int Mask[] = {0, 5, 6, 7};
15892     return Builder.CreateShuffleVector(Call, Ops[2], Mask);
15893   }
15894   case X86::BI__builtin_ia32_prefetchi:
15895     return Builder.CreateCall(
15896         CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
15897         {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
15898          llvm::ConstantInt::get(Int32Ty, 0)});
15899   }
15900 }
15901 
15902 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
15903                                            const CallExpr *E) {
15904   // Do not emit the builtin arguments in the arguments of a function call,
15905   // because the evaluation order of function arguments is not specified in C++.
15906   // This is important when testing to ensure the arguments are emitted in the
  // same order every time. E.g.:
15908   // Instead of:
15909   //   return Builder.CreateFDiv(EmitScalarExpr(E->getArg(0)),
15910   //                             EmitScalarExpr(E->getArg(1)), "swdiv");
15911   // Use:
15912   //   Value *Op0 = EmitScalarExpr(E->getArg(0));
15913   //   Value *Op1 = EmitScalarExpr(E->getArg(1));
  //   return Builder.CreateFDiv(Op0, Op1, "swdiv");
15915 
15916   Intrinsic::ID ID = Intrinsic::not_intrinsic;
15917 
15918   switch (BuiltinID) {
15919   default: return nullptr;
15920 
15921   // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
15922   // call __builtin_readcyclecounter.
15923   case PPC::BI__builtin_ppc_get_timebase:
15924     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
15925 
15926   // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
15927   case PPC::BI__builtin_altivec_lvx:
15928   case PPC::BI__builtin_altivec_lvxl:
15929   case PPC::BI__builtin_altivec_lvebx:
15930   case PPC::BI__builtin_altivec_lvehx:
15931   case PPC::BI__builtin_altivec_lvewx:
15932   case PPC::BI__builtin_altivec_lvsl:
15933   case PPC::BI__builtin_altivec_lvsr:
15934   case PPC::BI__builtin_vsx_lxvd2x:
15935   case PPC::BI__builtin_vsx_lxvw4x:
15936   case PPC::BI__builtin_vsx_lxvd2x_be:
15937   case PPC::BI__builtin_vsx_lxvw4x_be:
15938   case PPC::BI__builtin_vsx_lxvl:
15939   case PPC::BI__builtin_vsx_lxvll:
15940   {
15941     SmallVector<Value *, 2> Ops;
15942     Ops.push_back(EmitScalarExpr(E->getArg(0)));
15943     Ops.push_back(EmitScalarExpr(E->getArg(1)));
    if (BuiltinID == PPC::BI__builtin_vsx_lxvl ||
        BuiltinID == PPC::BI__builtin_vsx_lxvll) {
      Ops[0] = Builder.CreateBitCast(Ops[0], Int8PtrTy);
    } else {
15948       Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
15949       Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
15950       Ops.pop_back();
15951     }
15952 
15953     switch (BuiltinID) {
15954     default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
15955     case PPC::BI__builtin_altivec_lvx:
15956       ID = Intrinsic::ppc_altivec_lvx;
15957       break;
15958     case PPC::BI__builtin_altivec_lvxl:
15959       ID = Intrinsic::ppc_altivec_lvxl;
15960       break;
15961     case PPC::BI__builtin_altivec_lvebx:
15962       ID = Intrinsic::ppc_altivec_lvebx;
15963       break;
15964     case PPC::BI__builtin_altivec_lvehx:
15965       ID = Intrinsic::ppc_altivec_lvehx;
15966       break;
15967     case PPC::BI__builtin_altivec_lvewx:
15968       ID = Intrinsic::ppc_altivec_lvewx;
15969       break;
15970     case PPC::BI__builtin_altivec_lvsl:
15971       ID = Intrinsic::ppc_altivec_lvsl;
15972       break;
15973     case PPC::BI__builtin_altivec_lvsr:
15974       ID = Intrinsic::ppc_altivec_lvsr;
15975       break;
15976     case PPC::BI__builtin_vsx_lxvd2x:
15977       ID = Intrinsic::ppc_vsx_lxvd2x;
15978       break;
15979     case PPC::BI__builtin_vsx_lxvw4x:
15980       ID = Intrinsic::ppc_vsx_lxvw4x;
15981       break;
15982     case PPC::BI__builtin_vsx_lxvd2x_be:
15983       ID = Intrinsic::ppc_vsx_lxvd2x_be;
15984       break;
15985     case PPC::BI__builtin_vsx_lxvw4x_be:
15986       ID = Intrinsic::ppc_vsx_lxvw4x_be;
15987       break;
15988     case PPC::BI__builtin_vsx_lxvl:
15989       ID = Intrinsic::ppc_vsx_lxvl;
15990       break;
15991     case PPC::BI__builtin_vsx_lxvll:
15992       ID = Intrinsic::ppc_vsx_lxvll;
15993       break;
15994     }
15995     llvm::Function *F = CGM.getIntrinsic(ID);
15996     return Builder.CreateCall(F, Ops, "");
15997   }
15998 
15999   // vec_st, vec_xst_be
16000   case PPC::BI__builtin_altivec_stvx:
16001   case PPC::BI__builtin_altivec_stvxl:
16002   case PPC::BI__builtin_altivec_stvebx:
16003   case PPC::BI__builtin_altivec_stvehx:
16004   case PPC::BI__builtin_altivec_stvewx:
16005   case PPC::BI__builtin_vsx_stxvd2x:
16006   case PPC::BI__builtin_vsx_stxvw4x:
16007   case PPC::BI__builtin_vsx_stxvd2x_be:
16008   case PPC::BI__builtin_vsx_stxvw4x_be:
16009   case PPC::BI__builtin_vsx_stxvl:
16010   case PPC::BI__builtin_vsx_stxvll:
16011   {
16012     SmallVector<Value *, 3> Ops;
16013     Ops.push_back(EmitScalarExpr(E->getArg(0)));
16014     Ops.push_back(EmitScalarExpr(E->getArg(1)));
16015     Ops.push_back(EmitScalarExpr(E->getArg(2)));
    if (BuiltinID == PPC::BI__builtin_vsx_stxvl ||
        BuiltinID == PPC::BI__builtin_vsx_stxvll) {
      Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
    } else {
16020       Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
16021       Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
16022       Ops.pop_back();
16023     }
16024 
16025     switch (BuiltinID) {
16026     default: llvm_unreachable("Unsupported st intrinsic!");
16027     case PPC::BI__builtin_altivec_stvx:
16028       ID = Intrinsic::ppc_altivec_stvx;
16029       break;
16030     case PPC::BI__builtin_altivec_stvxl:
16031       ID = Intrinsic::ppc_altivec_stvxl;
16032       break;
16033     case PPC::BI__builtin_altivec_stvebx:
16034       ID = Intrinsic::ppc_altivec_stvebx;
16035       break;
16036     case PPC::BI__builtin_altivec_stvehx:
16037       ID = Intrinsic::ppc_altivec_stvehx;
16038       break;
16039     case PPC::BI__builtin_altivec_stvewx:
16040       ID = Intrinsic::ppc_altivec_stvewx;
16041       break;
16042     case PPC::BI__builtin_vsx_stxvd2x:
16043       ID = Intrinsic::ppc_vsx_stxvd2x;
16044       break;
16045     case PPC::BI__builtin_vsx_stxvw4x:
16046       ID = Intrinsic::ppc_vsx_stxvw4x;
16047       break;
16048     case PPC::BI__builtin_vsx_stxvd2x_be:
16049       ID = Intrinsic::ppc_vsx_stxvd2x_be;
16050       break;
16051     case PPC::BI__builtin_vsx_stxvw4x_be:
16052       ID = Intrinsic::ppc_vsx_stxvw4x_be;
16053       break;
16054     case PPC::BI__builtin_vsx_stxvl:
16055       ID = Intrinsic::ppc_vsx_stxvl;
16056       break;
16057     case PPC::BI__builtin_vsx_stxvll:
16058       ID = Intrinsic::ppc_vsx_stxvll;
16059       break;
16060     }
16061     llvm::Function *F = CGM.getIntrinsic(ID);
16062     return Builder.CreateCall(F, Ops, "");
16063   }
16064   case PPC::BI__builtin_vsx_ldrmb: {
    // This essentially boils down to performing an unaligned VMX load
    // sequence so as to avoid crossing a page boundary, and then shuffling
    // the elements into the right side of the vector register.
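    // Rough shape of the emitted sequence (assuming little-endian):
    //   %lo   = lvx(addr)                  ; load covering the first bytes
    //   %hi   = lvx(addr + NumBytes - 1)   ; load covering the last bytes
    //   %mask = lvsr(addr)                 ; permute control vector
    //   %all  = vperm(%hi, %lo, %mask)     ; combine without page crossing
    // followed by a final shuffle that right-justifies the NumBytes elements.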
16068     Value *Op0 = EmitScalarExpr(E->getArg(0));
16069     Value *Op1 = EmitScalarExpr(E->getArg(1));
16070     int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16071     llvm::Type *ResTy = ConvertType(E->getType());
16072     bool IsLE = getTarget().isLittleEndian();
16073 
16074     // If the user wants the entire vector, just load the entire vector.
16075     if (NumBytes == 16) {
16076       Value *LD =
16077           Builder.CreateLoad(Address(Op0, ResTy, CharUnits::fromQuantity(1)));
16078       if (!IsLE)
16079         return LD;
16080 
16081       // Reverse the bytes on LE.
16082       SmallVector<int, 16> RevMask;
16083       for (int Idx = 0; Idx < 16; Idx++)
16084         RevMask.push_back(15 - Idx);
16085       return Builder.CreateShuffleVector(LD, LD, RevMask);
16086     }
16087 
16088     llvm::Function *Lvx = CGM.getIntrinsic(Intrinsic::ppc_altivec_lvx);
16089     llvm::Function *Lvs = CGM.getIntrinsic(IsLE ? Intrinsic::ppc_altivec_lvsr
16090                                                 : Intrinsic::ppc_altivec_lvsl);
16091     llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm);
16092     Value *HiMem = Builder.CreateGEP(
16093         Int8Ty, Op0, ConstantInt::get(Op1->getType(), NumBytes - 1));
16094     Value *LoLd = Builder.CreateCall(Lvx, Op0, "ld.lo");
16095     Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi");
16096     Value *Mask1 = Builder.CreateCall(Lvs, Op0, "mask1");
16097 
16098     Op0 = IsLE ? HiLd : LoLd;
16099     Op1 = IsLE ? LoLd : HiLd;
16100     Value *AllElts = Builder.CreateCall(Vperm, {Op0, Op1, Mask1}, "shuffle1");
    Constant *Zero =
        llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType());
16102 
16103     if (IsLE) {
16104       SmallVector<int, 16> Consts;
16105       for (int Idx = 0; Idx < 16; Idx++) {
16106         int Val = (NumBytes - Idx - 1 >= 0) ? (NumBytes - Idx - 1)
16107                                             : 16 - (NumBytes - Idx);
16108         Consts.push_back(Val);
16109       }
16110       return Builder.CreateShuffleVector(Builder.CreateBitCast(AllElts, ResTy),
16111                                          Zero, Consts);
16112     }
16113     SmallVector<Constant *, 16> Consts;
16114     for (int Idx = 0; Idx < 16; Idx++)
16115       Consts.push_back(Builder.getInt8(NumBytes + Idx));
16116     Value *Mask2 = ConstantVector::get(Consts);
16117     return Builder.CreateBitCast(
16118         Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy);
16119   }
16120   case PPC::BI__builtin_vsx_strmb: {
16121     Value *Op0 = EmitScalarExpr(E->getArg(0));
16122     Value *Op1 = EmitScalarExpr(E->getArg(1));
16123     Value *Op2 = EmitScalarExpr(E->getArg(2));
16124     int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16125     bool IsLE = getTarget().isLittleEndian();
16126     auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) {
      // When storing the whole vector, simply store it on BE; on LE, reverse
      // the bytes first and then store.
16129       if (Width == 16) {
16130         Value *StVec = Op2;
16131         if (IsLE) {
16132           SmallVector<int, 16> RevMask;
16133           for (int Idx = 0; Idx < 16; Idx++)
16134             RevMask.push_back(15 - Idx);
16135           StVec = Builder.CreateShuffleVector(Op2, Op2, RevMask);
16136         }
16137         return Builder.CreateStore(
16138             StVec, Address(Op0, Op2->getType(), CharUnits::fromQuantity(1)));
16139       }
16140       auto *ConvTy = Int64Ty;
16141       unsigned NumElts = 0;
16142       switch (Width) {
16143       default:
16144         llvm_unreachable("width for stores must be a power of 2");
16145       case 8:
16146         ConvTy = Int64Ty;
16147         NumElts = 2;
16148         break;
16149       case 4:
16150         ConvTy = Int32Ty;
16151         NumElts = 4;
16152         break;
16153       case 2:
16154         ConvTy = Int16Ty;
16155         NumElts = 8;
16156         break;
16157       case 1:
16158         ConvTy = Int8Ty;
16159         NumElts = 16;
16160         break;
16161       }
16162       Value *Vec = Builder.CreateBitCast(
16163           Op2, llvm::FixedVectorType::get(ConvTy, NumElts));
16164       Value *Ptr =
16165           Builder.CreateGEP(Int8Ty, Op0, ConstantInt::get(Int64Ty, Offset));
16166       Value *Elt = Builder.CreateExtractElement(Vec, EltNo);
16167       if (IsLE && Width > 1) {
16168         Function *F = CGM.getIntrinsic(Intrinsic::bswap, ConvTy);
16169         Elt = Builder.CreateCall(F, Elt);
16170       }
16171       return Builder.CreateStore(
16172           Elt, Address(Ptr, ConvTy, CharUnits::fromQuantity(1)));
16173     };
16174     unsigned Stored = 0;
16175     unsigned RemainingBytes = NumBytes;
16176     Value *Result;
16177     if (NumBytes == 16)
16178       return StoreSubVec(16, 0, 0);
16179     if (NumBytes >= 8) {
16180       Result = StoreSubVec(8, NumBytes - 8, IsLE ? 0 : 1);
16181       RemainingBytes -= 8;
16182       Stored += 8;
16183     }
16184     if (RemainingBytes >= 4) {
16185       Result = StoreSubVec(4, NumBytes - Stored - 4,
16186                            IsLE ? (Stored >> 2) : 3 - (Stored >> 2));
16187       RemainingBytes -= 4;
16188       Stored += 4;
16189     }
16190     if (RemainingBytes >= 2) {
16191       Result = StoreSubVec(2, NumBytes - Stored - 2,
16192                            IsLE ? (Stored >> 1) : 7 - (Stored >> 1));
16193       RemainingBytes -= 2;
16194       Stored += 2;
16195     }
16196     if (RemainingBytes)
16197       Result =
16198           StoreSubVec(1, NumBytes - Stored - 1, IsLE ? Stored : 15 - Stored);
16199     return Result;
16200   }
16201   // Square root
16202   case PPC::BI__builtin_vsx_xvsqrtsp:
16203   case PPC::BI__builtin_vsx_xvsqrtdp: {
16204     llvm::Type *ResultType = ConvertType(E->getType());
16205     Value *X = EmitScalarExpr(E->getArg(0));
16206     if (Builder.getIsFPConstrained()) {
16207       llvm::Function *F = CGM.getIntrinsic(
16208           Intrinsic::experimental_constrained_sqrt, ResultType);
16209       return Builder.CreateConstrainedFPCall(F, X);
16210     } else {
16211       llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
16212       return Builder.CreateCall(F, X);
16213     }
16214   }
16215   // Count leading zeros
16216   case PPC::BI__builtin_altivec_vclzb:
16217   case PPC::BI__builtin_altivec_vclzh:
16218   case PPC::BI__builtin_altivec_vclzw:
16219   case PPC::BI__builtin_altivec_vclzd: {
16220     llvm::Type *ResultType = ConvertType(E->getType());
16221     Value *X = EmitScalarExpr(E->getArg(0));
16222     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16223     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
16224     return Builder.CreateCall(F, {X, Undef});
16225   }
16226   case PPC::BI__builtin_altivec_vctzb:
16227   case PPC::BI__builtin_altivec_vctzh:
16228   case PPC::BI__builtin_altivec_vctzw:
16229   case PPC::BI__builtin_altivec_vctzd: {
16230     llvm::Type *ResultType = ConvertType(E->getType());
16231     Value *X = EmitScalarExpr(E->getArg(0));
16232     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16233     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
16234     return Builder.CreateCall(F, {X, Undef});
16235   }
16236   case PPC::BI__builtin_altivec_vinsd:
16237   case PPC::BI__builtin_altivec_vinsw:
16238   case PPC::BI__builtin_altivec_vinsd_elt:
16239   case PPC::BI__builtin_altivec_vinsw_elt: {
16240     llvm::Type *ResultType = ConvertType(E->getType());
16241     Value *Op0 = EmitScalarExpr(E->getArg(0));
16242     Value *Op1 = EmitScalarExpr(E->getArg(1));
16243     Value *Op2 = EmitScalarExpr(E->getArg(2));
16244 
16245     bool IsUnaligned = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16246                         BuiltinID == PPC::BI__builtin_altivec_vinsd);
16247 
16248     bool Is32bit = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16249                     BuiltinID == PPC::BI__builtin_altivec_vinsw_elt);
16250 
16251     // The third argument must be a compile time constant.
16252     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16253     assert(ArgCI &&
16254            "Third Arg to vinsw/vinsd intrinsic must be a constant integer!");
16255 
    // The valid range for the third argument depends on the input type and
    // the builtin being called.
16258     int ValidMaxValue = 0;
16259     if (IsUnaligned)
16260       ValidMaxValue = (Is32bit) ? 12 : 8;
16261     else
16262       ValidMaxValue = (Is32bit) ? 3 : 1;
16263 
16264     // Get value of third argument.
16265     int64_t ConstArg = ArgCI->getSExtValue();
16266 
16267     // Compose range checking error message.
16268     std::string RangeErrMsg = IsUnaligned ? "byte" : "element";
16269     RangeErrMsg += " number " + llvm::to_string(ConstArg);
16270     RangeErrMsg += " is outside of the valid range [0, ";
16271     RangeErrMsg += llvm::to_string(ValidMaxValue) + "]";
16272 
16273     // Issue error if third argument is not within the valid range.
16274     if (ConstArg < 0 || ConstArg > ValidMaxValue)
16275       CGM.Error(E->getExprLoc(), RangeErrMsg);
16276 
16277     // Input to vec_replace_elt is an element index, convert to byte index.
16278     if (!IsUnaligned) {
16279       ConstArg *= Is32bit ? 4 : 8;
      // Fix the constant according to endianness.
16281       if (getTarget().isLittleEndian())
16282         ConstArg = (Is32bit ? 12 : 8) - ConstArg;
16283     }
16284 
16285     ID = Is32bit ? Intrinsic::ppc_altivec_vinsw : Intrinsic::ppc_altivec_vinsd;
16286     Op2 = ConstantInt::getSigned(Int32Ty, ConstArg);
16287     // Casting input to vector int as per intrinsic definition.
16288     Op0 =
16289         Is32bit
16290             ? Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4))
16291             : Builder.CreateBitCast(Op0,
16292                                     llvm::FixedVectorType::get(Int64Ty, 2));
16293     return Builder.CreateBitCast(
16294         Builder.CreateCall(CGM.getIntrinsic(ID), {Op0, Op1, Op2}), ResultType);
16295   }
16296   case PPC::BI__builtin_altivec_vpopcntb:
16297   case PPC::BI__builtin_altivec_vpopcnth:
16298   case PPC::BI__builtin_altivec_vpopcntw:
16299   case PPC::BI__builtin_altivec_vpopcntd: {
16300     llvm::Type *ResultType = ConvertType(E->getType());
16301     Value *X = EmitScalarExpr(E->getArg(0));
16302     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
16303     return Builder.CreateCall(F, X);
16304   }
16305   case PPC::BI__builtin_altivec_vadduqm:
16306   case PPC::BI__builtin_altivec_vsubuqm: {
16307     Value *Op0 = EmitScalarExpr(E->getArg(0));
16308     Value *Op1 = EmitScalarExpr(E->getArg(1));
16309     llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
16310     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int128Ty, 1));
16311     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int128Ty, 1));
16312     if (BuiltinID == PPC::BI__builtin_altivec_vadduqm)
16313       return Builder.CreateAdd(Op0, Op1, "vadduqm");
16314     else
16315       return Builder.CreateSub(Op0, Op1, "vsubuqm");
16316   }
16317   case PPC::BI__builtin_altivec_vaddcuq_c:
16318   case PPC::BI__builtin_altivec_vsubcuq_c: {
16319     SmallVector<Value *, 2> Ops;
16320     Value *Op0 = EmitScalarExpr(E->getArg(0));
16321     Value *Op1 = EmitScalarExpr(E->getArg(1));
16322     llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16323         llvm::IntegerType::get(getLLVMContext(), 128), 1);
16324     Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16325     Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16326     ID = (BuiltinID == PPC::BI__builtin_altivec_vaddcuq_c)
16327              ? Intrinsic::ppc_altivec_vaddcuq
16328              : Intrinsic::ppc_altivec_vsubcuq;
16329     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16330   }
16331   case PPC::BI__builtin_altivec_vaddeuqm_c:
16332   case PPC::BI__builtin_altivec_vaddecuq_c:
16333   case PPC::BI__builtin_altivec_vsubeuqm_c:
16334   case PPC::BI__builtin_altivec_vsubecuq_c: {
16335     SmallVector<Value *, 3> Ops;
16336     Value *Op0 = EmitScalarExpr(E->getArg(0));
16337     Value *Op1 = EmitScalarExpr(E->getArg(1));
16338     Value *Op2 = EmitScalarExpr(E->getArg(2));
16339     llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16340         llvm::IntegerType::get(getLLVMContext(), 128), 1);
16341     Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16342     Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16343     Ops.push_back(Builder.CreateBitCast(Op2, V1I128Ty));
16344     switch (BuiltinID) {
16345     default:
16346       llvm_unreachable("Unsupported intrinsic!");
16347     case PPC::BI__builtin_altivec_vaddeuqm_c:
16348       ID = Intrinsic::ppc_altivec_vaddeuqm;
16349       break;
16350     case PPC::BI__builtin_altivec_vaddecuq_c:
16351       ID = Intrinsic::ppc_altivec_vaddecuq;
16352       break;
16353     case PPC::BI__builtin_altivec_vsubeuqm_c:
16354       ID = Intrinsic::ppc_altivec_vsubeuqm;
16355       break;
16356     case PPC::BI__builtin_altivec_vsubecuq_c:
16357       ID = Intrinsic::ppc_altivec_vsubecuq;
16358       break;
16359     }
16360     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16361   }
16362   // Rotate and insert under mask operation.
16363   // __rldimi(rs, is, shift, mask)
16364   // (rotl64(rs, shift) & mask) | (is & ~mask)
16365   // __rlwimi(rs, is, shift, mask)
16366   // (rotl(rs, shift) & mask) | (is & ~mask)
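  // For example, __rlwimi(rs, is, 8, 0xFF0000) takes bits 16-23 of
  // rotl(rs, 8) and the remaining bits from is (illustrative).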
16367   case PPC::BI__builtin_ppc_rldimi:
16368   case PPC::BI__builtin_ppc_rlwimi: {
16369     Value *Op0 = EmitScalarExpr(E->getArg(0));
16370     Value *Op1 = EmitScalarExpr(E->getArg(1));
16371     Value *Op2 = EmitScalarExpr(E->getArg(2));
16372     Value *Op3 = EmitScalarExpr(E->getArg(3));
16373     llvm::Type *Ty = Op0->getType();
16374     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
16375     if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
16376       Op2 = Builder.CreateZExt(Op2, Int64Ty);
16377     Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
16378     Value *X = Builder.CreateAnd(Shift, Op3);
16379     Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
16380     return Builder.CreateOr(X, Y);
16381   }
16382   // Rotate and insert under mask operation.
16383   // __rlwnm(rs, shift, mask)
16384   // rotl(rs, shift) & mask
16385   case PPC::BI__builtin_ppc_rlwnm: {
16386     Value *Op0 = EmitScalarExpr(E->getArg(0));
16387     Value *Op1 = EmitScalarExpr(E->getArg(1));
16388     Value *Op2 = EmitScalarExpr(E->getArg(2));
16389     llvm::Type *Ty = Op0->getType();
16390     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
16391     Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
16392     return Builder.CreateAnd(Shift, Op2);
16393   }
16394   case PPC::BI__builtin_ppc_poppar4:
16395   case PPC::BI__builtin_ppc_poppar8: {
16396     Value *Op0 = EmitScalarExpr(E->getArg(0));
16397     llvm::Type *ArgType = Op0->getType();
16398     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
16399     Value *Tmp = Builder.CreateCall(F, Op0);
16400 
16401     llvm::Type *ResultType = ConvertType(E->getType());
16402     Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
16403     if (Result->getType() != ResultType)
16404       Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
16405                                      "cast");
16406     return Result;
16407   }
16408   case PPC::BI__builtin_ppc_cmpb: {
16409     Value *Op0 = EmitScalarExpr(E->getArg(0));
16410     Value *Op1 = EmitScalarExpr(E->getArg(1));
16411     if (getTarget().getTriple().isPPC64()) {
16412       Function *F =
16413           CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int64Ty, Int64Ty, Int64Ty});
16414       return Builder.CreateCall(F, {Op0, Op1}, "cmpb");
16415     }
    // For 32-bit, emit the code as below:
16417     // %conv = trunc i64 %a to i32
16418     // %conv1 = trunc i64 %b to i32
16419     // %shr = lshr i64 %a, 32
16420     // %conv2 = trunc i64 %shr to i32
16421     // %shr3 = lshr i64 %b, 32
16422     // %conv4 = trunc i64 %shr3 to i32
16423     // %0 = tail call i32 @llvm.ppc.cmpb32(i32 %conv, i32 %conv1)
16424     // %conv5 = zext i32 %0 to i64
16425     // %1 = tail call i32 @llvm.ppc.cmpb32(i32 %conv2, i32 %conv4)
16426     // %conv614 = zext i32 %1 to i64
16427     // %shl = shl nuw i64 %conv614, 32
16428     // %or = or i64 %shl, %conv5
16429     // ret i64 %or
16430     Function *F =
16431         CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int32Ty, Int32Ty, Int32Ty});
16432     Value *ArgOneLo = Builder.CreateTrunc(Op0, Int32Ty);
16433     Value *ArgTwoLo = Builder.CreateTrunc(Op1, Int32Ty);
16434     Constant *ShiftAmt = ConstantInt::get(Int64Ty, 32);
16435     Value *ArgOneHi =
16436         Builder.CreateTrunc(Builder.CreateLShr(Op0, ShiftAmt), Int32Ty);
16437     Value *ArgTwoHi =
16438         Builder.CreateTrunc(Builder.CreateLShr(Op1, ShiftAmt), Int32Ty);
16439     Value *ResLo = Builder.CreateZExt(
16440         Builder.CreateCall(F, {ArgOneLo, ArgTwoLo}, "cmpb"), Int64Ty);
16441     Value *ResHiShift = Builder.CreateZExt(
16442         Builder.CreateCall(F, {ArgOneHi, ArgTwoHi}, "cmpb"), Int64Ty);
16443     Value *ResHi = Builder.CreateShl(ResHiShift, ShiftAmt);
16444     return Builder.CreateOr(ResLo, ResHi);
16445   }
16446   // Copy sign
16447   case PPC::BI__builtin_vsx_xvcpsgnsp:
16448   case PPC::BI__builtin_vsx_xvcpsgndp: {
16449     llvm::Type *ResultType = ConvertType(E->getType());
16450     Value *X = EmitScalarExpr(E->getArg(0));
16451     Value *Y = EmitScalarExpr(E->getArg(1));
16452     ID = Intrinsic::copysign;
16453     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
16454     return Builder.CreateCall(F, {X, Y});
16455   }
16456   // Rounding/truncation
16457   case PPC::BI__builtin_vsx_xvrspip:
16458   case PPC::BI__builtin_vsx_xvrdpip:
16459   case PPC::BI__builtin_vsx_xvrdpim:
16460   case PPC::BI__builtin_vsx_xvrspim:
16461   case PPC::BI__builtin_vsx_xvrdpi:
16462   case PPC::BI__builtin_vsx_xvrspi:
16463   case PPC::BI__builtin_vsx_xvrdpic:
16464   case PPC::BI__builtin_vsx_xvrspic:
16465   case PPC::BI__builtin_vsx_xvrdpiz:
16466   case PPC::BI__builtin_vsx_xvrspiz: {
16467     llvm::Type *ResultType = ConvertType(E->getType());
16468     Value *X = EmitScalarExpr(E->getArg(0));
16469     if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
16470         BuiltinID == PPC::BI__builtin_vsx_xvrspim)
16471       ID = Builder.getIsFPConstrained()
16472                ? Intrinsic::experimental_constrained_floor
16473                : Intrinsic::floor;
16474     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
16475              BuiltinID == PPC::BI__builtin_vsx_xvrspi)
16476       ID = Builder.getIsFPConstrained()
16477                ? Intrinsic::experimental_constrained_round
16478                : Intrinsic::round;
16479     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
16480              BuiltinID == PPC::BI__builtin_vsx_xvrspic)
16481       ID = Builder.getIsFPConstrained()
16482                ? Intrinsic::experimental_constrained_rint
16483                : Intrinsic::rint;
16484     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
16485              BuiltinID == PPC::BI__builtin_vsx_xvrspip)
16486       ID = Builder.getIsFPConstrained()
16487                ? Intrinsic::experimental_constrained_ceil
16488                : Intrinsic::ceil;
16489     else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
16490              BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
16491       ID = Builder.getIsFPConstrained()
16492                ? Intrinsic::experimental_constrained_trunc
16493                : Intrinsic::trunc;
16494     llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
16495     return Builder.getIsFPConstrained() ? Builder.CreateConstrainedFPCall(F, X)
16496                                         : Builder.CreateCall(F, X);
16497   }
16498 
16499   // Absolute value
16500   case PPC::BI__builtin_vsx_xvabsdp:
16501   case PPC::BI__builtin_vsx_xvabssp: {
16502     llvm::Type *ResultType = ConvertType(E->getType());
16503     Value *X = EmitScalarExpr(E->getArg(0));
16504     llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
16505     return Builder.CreateCall(F, X);
16506   }
16507 
16508   // Fastmath by default
16509   case PPC::BI__builtin_ppc_recipdivf:
16510   case PPC::BI__builtin_ppc_recipdivd:
16511   case PPC::BI__builtin_ppc_rsqrtf:
16512   case PPC::BI__builtin_ppc_rsqrtd: {
16513     FastMathFlags FMF = Builder.getFastMathFlags();
16514     Builder.getFastMathFlags().setFast();
16515     llvm::Type *ResultType = ConvertType(E->getType());
16516     Value *X = EmitScalarExpr(E->getArg(0));
16517 
16518     if (BuiltinID == PPC::BI__builtin_ppc_recipdivf ||
16519         BuiltinID == PPC::BI__builtin_ppc_recipdivd) {
16520       Value *Y = EmitScalarExpr(E->getArg(1));
16521       Value *FDiv = Builder.CreateFDiv(X, Y, "recipdiv");
16522       Builder.getFastMathFlags() &= (FMF);
16523       return FDiv;
16524     }
16525     auto *One = ConstantFP::get(ResultType, 1.0);
16526     llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
16527     Value *FDiv = Builder.CreateFDiv(One, Builder.CreateCall(F, X), "rsqrt");
16528     Builder.getFastMathFlags() &= (FMF);
16529     return FDiv;
16530   }
16531   case PPC::BI__builtin_ppc_alignx: {
16532     Value *Op0 = EmitScalarExpr(E->getArg(0));
16533     Value *Op1 = EmitScalarExpr(E->getArg(1));
16534     ConstantInt *AlignmentCI = cast<ConstantInt>(Op0);
16535     if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
16536       AlignmentCI = ConstantInt::get(AlignmentCI->getType(),
16537                                      llvm::Value::MaximumAlignment);
16538 
16539     emitAlignmentAssumption(Op1, E->getArg(1),
16540                             /*The expr loc is sufficient.*/ SourceLocation(),
16541                             AlignmentCI, nullptr);
16542     return Op1;
16543   }
16544   case PPC::BI__builtin_ppc_rdlam: {
16545     Value *Op0 = EmitScalarExpr(E->getArg(0));
16546     Value *Op1 = EmitScalarExpr(E->getArg(1));
16547     Value *Op2 = EmitScalarExpr(E->getArg(2));
16548     llvm::Type *Ty = Op0->getType();
16549     Value *ShiftAmt = Builder.CreateIntCast(Op1, Ty, false);
16550     Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
16551     Value *Rotate = Builder.CreateCall(F, {Op0, Op0, ShiftAmt});
16552     return Builder.CreateAnd(Rotate, Op2);
16553   }
16554   case PPC::BI__builtin_ppc_load2r: {
16555     Function *F = CGM.getIntrinsic(Intrinsic::ppc_load2r);
16556     Value *Op0 = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
16557     Value *LoadIntrinsic = Builder.CreateCall(F, {Op0});
16558     return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
16559   }
16560   // FMA variations
16561   case PPC::BI__builtin_ppc_fnmsub:
16562   case PPC::BI__builtin_ppc_fnmsubs:
16563   case PPC::BI__builtin_vsx_xvmaddadp:
16564   case PPC::BI__builtin_vsx_xvmaddasp:
16565   case PPC::BI__builtin_vsx_xvnmaddadp:
16566   case PPC::BI__builtin_vsx_xvnmaddasp:
16567   case PPC::BI__builtin_vsx_xvmsubadp:
16568   case PPC::BI__builtin_vsx_xvmsubasp:
16569   case PPC::BI__builtin_vsx_xvnmsubadp:
16570   case PPC::BI__builtin_vsx_xvnmsubasp: {
16571     llvm::Type *ResultType = ConvertType(E->getType());
16572     Value *X = EmitScalarExpr(E->getArg(0));
16573     Value *Y = EmitScalarExpr(E->getArg(1));
16574     Value *Z = EmitScalarExpr(E->getArg(2));
16575     llvm::Function *F;
16576     if (Builder.getIsFPConstrained())
16577       F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
16578     else
16579       F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
16580     switch (BuiltinID) {
16581       case PPC::BI__builtin_vsx_xvmaddadp:
16582       case PPC::BI__builtin_vsx_xvmaddasp:
16583         if (Builder.getIsFPConstrained())
16584           return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
16585         else
16586           return Builder.CreateCall(F, {X, Y, Z});
16587       case PPC::BI__builtin_vsx_xvnmaddadp:
16588       case PPC::BI__builtin_vsx_xvnmaddasp:
16589         if (Builder.getIsFPConstrained())
16590           return Builder.CreateFNeg(
16591               Builder.CreateConstrainedFPCall(F, {X, Y, Z}), "neg");
16592         else
16593           return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
16594       case PPC::BI__builtin_vsx_xvmsubadp:
16595       case PPC::BI__builtin_vsx_xvmsubasp:
16596         if (Builder.getIsFPConstrained())
16597           return Builder.CreateConstrainedFPCall(
16598               F, {X, Y, Builder.CreateFNeg(Z, "neg")});
16599         else
16600           return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
16601       case PPC::BI__builtin_ppc_fnmsub:
16602       case PPC::BI__builtin_ppc_fnmsubs:
16603       case PPC::BI__builtin_vsx_xvnmsubadp:
16604       case PPC::BI__builtin_vsx_xvnmsubasp:
16605         if (Builder.getIsFPConstrained())
16606           return Builder.CreateFNeg(
16607               Builder.CreateConstrainedFPCall(
16608                   F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
16609               "neg");
16610         else
16611           return Builder.CreateCall(
16612               CGM.getIntrinsic(Intrinsic::ppc_fnmsub, ResultType), {X, Y, Z});
16613       }
16614     llvm_unreachable("Unknown FMA operation");
16615     return nullptr; // Suppress no-return warning
16616   }
16617 
16618   case PPC::BI__builtin_vsx_insertword: {
16619     Value *Op0 = EmitScalarExpr(E->getArg(0));
16620     Value *Op1 = EmitScalarExpr(E->getArg(1));
16621     Value *Op2 = EmitScalarExpr(E->getArg(2));
16622     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
16623 
    // The third argument is a compile time constant int. It must be clamped
    // to the range [0, 12].
16626     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16627     assert(ArgCI &&
16628            "Third arg to xxinsertw intrinsic must be constant integer");
16629     const int64_t MaxIndex = 12;
16630     int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
16631 
    // The builtin semantics don't exactly match the xxinsertw instruction's
    // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
    // word from the first argument and inserts it into the second argument.
    // The instruction extracts the word from its second input register and
    // inserts it into its first input register, so swap the first and second
    // arguments.
16637     std::swap(Op0, Op1);
16638 
16639     // Need to cast the second argument from a vector of unsigned int to a
16640     // vector of long long.
16641     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
16642 
16643     if (getTarget().isLittleEndian()) {
16644       // Reverse the double words in the vector we will extract from.
16645       Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
16646       Op0 = Builder.CreateShuffleVector(Op0, Op0, ArrayRef<int>{1, 0});
16647 
16648       // Reverse the index.
16649       Index = MaxIndex - Index;
16650     }
16651 
16652     // Intrinsic expects the first arg to be a vector of int.
16653     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
16654     Op2 = ConstantInt::getSigned(Int32Ty, Index);
16655     return Builder.CreateCall(F, {Op0, Op1, Op2});
16656   }
16657 
16658   case PPC::BI__builtin_vsx_extractuword: {
16659     Value *Op0 = EmitScalarExpr(E->getArg(0));
16660     Value *Op1 = EmitScalarExpr(E->getArg(1));
16661     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
16662 
16663     // Intrinsic expects the first argument to be a vector of doublewords.
16664     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
16665 
16666     // The second argument is a compile time constant int that needs to
16667     // be clamped to the range [0, 12].
16668     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op1);
16669     assert(ArgCI &&
16670            "Second Arg to xxextractuw intrinsic must be a constant integer!");
16671     const int64_t MaxIndex = 12;
16672     int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
16673 
16674     if (getTarget().isLittleEndian()) {
16675       // Reverse the index.
16676       Index = MaxIndex - Index;
16677       Op1 = ConstantInt::getSigned(Int32Ty, Index);
16678 
      // Emit the call, then reverse the double words of the result vector.
16680       Value *Call = Builder.CreateCall(F, {Op0, Op1});
16681 
16682       Value *ShuffleCall =
16683           Builder.CreateShuffleVector(Call, Call, ArrayRef<int>{1, 0});
16684       return ShuffleCall;
16685     } else {
16686       Op1 = ConstantInt::getSigned(Int32Ty, Index);
16687       return Builder.CreateCall(F, {Op0, Op1});
16688     }
16689   }
16690 
16691   case PPC::BI__builtin_vsx_xxpermdi: {
16692     Value *Op0 = EmitScalarExpr(E->getArg(0));
16693     Value *Op1 = EmitScalarExpr(E->getArg(1));
16694     Value *Op2 = EmitScalarExpr(E->getArg(2));
16695     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16696     assert(ArgCI && "Third arg must be constant integer!");
16697 
16698     unsigned Index = ArgCI->getZExtValue();
16699     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
16700     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
16701 
16702     // Account for endianness by treating this as just a shuffle. So we use the
16703     // same indices for both LE and BE in order to produce expected results in
16704     // both cases.
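    // Bit 1 of the immediate selects the doubleword taken from Op0 and bit 0
    // selects the doubleword taken from Op1 (shuffle indices 2 and 3 refer to
    // Op1).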
16705     int ElemIdx0 = (Index & 2) >> 1;
16706     int ElemIdx1 = 2 + (Index & 1);
16707 
16708     int ShuffleElts[2] = {ElemIdx0, ElemIdx1};
16709     Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
16710     QualType BIRetType = E->getType();
16711     auto RetTy = ConvertType(BIRetType);
16712     return Builder.CreateBitCast(ShuffleCall, RetTy);
16713   }
16714 
16715   case PPC::BI__builtin_vsx_xxsldwi: {
16716     Value *Op0 = EmitScalarExpr(E->getArg(0));
16717     Value *Op1 = EmitScalarExpr(E->getArg(1));
16718     Value *Op2 = EmitScalarExpr(E->getArg(2));
16719     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16720     assert(ArgCI && "Third argument must be a compile time constant");
16721     unsigned Index = ArgCI->getZExtValue() & 0x3;
16722     Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
16723     Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int32Ty, 4));
16724 
16725     // Create a shuffle mask
16726     int ElemIdx0;
16727     int ElemIdx1;
16728     int ElemIdx2;
16729     int ElemIdx3;
16730     if (getTarget().isLittleEndian()) {
16731       // Little endian element N comes from element 8+N-Index of the
16732       // concatenated wide vector (of course, using modulo arithmetic on
16733       // the total number of elements).
16734       ElemIdx0 = (8 - Index) % 8;
16735       ElemIdx1 = (9 - Index) % 8;
16736       ElemIdx2 = (10 - Index) % 8;
16737       ElemIdx3 = (11 - Index) % 8;
16738     } else {
16739       // Big endian ElemIdx<N> = Index + N
16740       ElemIdx0 = Index;
16741       ElemIdx1 = Index + 1;
16742       ElemIdx2 = Index + 2;
16743       ElemIdx3 = Index + 3;
16744     }
16745 
16746     int ShuffleElts[4] = {ElemIdx0, ElemIdx1, ElemIdx2, ElemIdx3};
16747     Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
16748     QualType BIRetType = E->getType();
16749     auto RetTy = ConvertType(BIRetType);
16750     return Builder.CreateBitCast(ShuffleCall, RetTy);
16751   }
16752 
16753   case PPC::BI__builtin_pack_vector_int128: {
16754     Value *Op0 = EmitScalarExpr(E->getArg(0));
16755     Value *Op1 = EmitScalarExpr(E->getArg(1));
16756     bool isLittleEndian = getTarget().isLittleEndian();
16757     Value *PoisonValue =
16758         llvm::PoisonValue::get(llvm::FixedVectorType::get(Op0->getType(), 2));
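    // Op0 becomes the most-significant doubleword of the i128 and Op1 the
    // least-significant one, so the insertion indices depend on endianness.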
16759     Value *Res = Builder.CreateInsertElement(
16760         PoisonValue, Op0, (uint64_t)(isLittleEndian ? 1 : 0));
16761     Res = Builder.CreateInsertElement(Res, Op1,
16762                                       (uint64_t)(isLittleEndian ? 0 : 1));
16763     return Builder.CreateBitCast(Res, ConvertType(E->getType()));
16764   }
16765 
16766   case PPC::BI__builtin_unpack_vector_int128: {
16767     Value *Op0 = EmitScalarExpr(E->getArg(0));
16768     Value *Op1 = EmitScalarExpr(E->getArg(1));
16769     ConstantInt *Index = cast<ConstantInt>(Op1);
16770     Value *Unpacked = Builder.CreateBitCast(
16771         Op0, llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
16772 
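    // Flip the index on little-endian targets so the same source-level index
    // selects the same doubleword regardless of endianness.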
16773     if (getTarget().isLittleEndian())
16774       Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());
16775 
16776     return Builder.CreateExtractElement(Unpacked, Index);
16777   }
16778 
16779   case PPC::BI__builtin_ppc_sthcx: {
16780     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_sthcx);
16781     Value *Op0 = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
16782     Value *Op1 = Builder.CreateSExt(EmitScalarExpr(E->getArg(1)), Int32Ty);
16783     return Builder.CreateCall(F, {Op0, Op1});
16784   }
16785 
16786   // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
16787   // Some of the MMA instructions accumulate their result into an existing
  // accumulator whereas the others generate a new accumulator. So we need
  // custom code generation to expand a builtin call with a pointer argument
  // into a load (if the corresponding instruction accumulates its result),
  // followed by the call to the intrinsic and a store of the result.
16792 #define CUSTOM_BUILTIN(Name, Intr, Types, Accumulate, Feature) \
16793   case PPC::BI__builtin_##Name:
16794 #include "clang/Basic/BuiltinsPPC.def"
16795   {
16796     SmallVector<Value *, 4> Ops;
16797     for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
16798       if (E->getArg(i)->getType()->isArrayType())
16799         Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
16800       else
16801         Ops.push_back(EmitScalarExpr(E->getArg(i)));
    // The first argument of these builtins is a pointer used to store their
    // result. However, the LLVM intrinsics return their result in multiple
    // return values. So, here we emit code extracting these values from the
    // intrinsic results and storing them using that pointer.
16806     if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
16807         BuiltinID == PPC::BI__builtin_vsx_disassemble_pair ||
16808         BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
16809       unsigned NumVecs = 2;
16810       auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair;
16811       if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
16812         NumVecs = 4;
16813         Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
16814       }
16815       llvm::Function *F = CGM.getIntrinsic(Intrinsic);
16816       Address Addr = EmitPointerWithAlignment(E->getArg(1));
16817       Value *Vec = Builder.CreateLoad(Addr);
16818       Value *Call = Builder.CreateCall(F, {Vec});
16819       llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, 16);
16820       Value *Ptr = Ops[0];
16821       for (unsigned i=0; i<NumVecs; i++) {
16822         Value *Vec = Builder.CreateExtractValue(Call, i);
16823         llvm::ConstantInt* Index = llvm::ConstantInt::get(IntTy, i);
16824         Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
16825         Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
16826       }
16827       return Call;
16828     }
16829     if (BuiltinID == PPC::BI__builtin_vsx_build_pair ||
16830         BuiltinID == PPC::BI__builtin_mma_build_acc) {
16831       // Reverse the order of the operands for LE, so the
16832       // same builtin call can be used on both LE and BE
16833       // without the need for the programmer to swap operands.
      // The operands are reversed starting from the second argument;
16835       // the first operand is the pointer to the pair/accumulator
16836       // that is being built.
16837       if (getTarget().isLittleEndian())
16838         std::reverse(Ops.begin() + 1, Ops.end());
16839     }
16840     bool Accumulate;
16841     switch (BuiltinID) {
16842   #define CUSTOM_BUILTIN(Name, Intr, Types, Acc, Feature) \
16843     case PPC::BI__builtin_##Name: \
16844       ID = Intrinsic::ppc_##Intr; \
16845       Accumulate = Acc; \
16846       break;
16847   #include "clang/Basic/BuiltinsPPC.def"
16848     }
16849     if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
16850         BuiltinID == PPC::BI__builtin_vsx_stxvp ||
16851         BuiltinID == PPC::BI__builtin_mma_lxvp ||
16852         BuiltinID == PPC::BI__builtin_mma_stxvp) {
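      // For lxvp/stxvp, fold the signed offset and the base pointer into a
      // single byte-indexed GEP and drop the now-redundant pointer operand
      // before calling the intrinsic.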
16853       if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
16854           BuiltinID == PPC::BI__builtin_mma_lxvp) {
16855         Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
16856         Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
16857       } else {
16858         Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
16859         Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
16860       }
16861       Ops.pop_back();
16862       llvm::Function *F = CGM.getIntrinsic(ID);
16863       return Builder.CreateCall(F, Ops, "");
16864     }
16865     SmallVector<Value*, 4> CallOps;
16866     if (Accumulate) {
16867       Address Addr = EmitPointerWithAlignment(E->getArg(0));
16868       Value *Acc = Builder.CreateLoad(Addr);
16869       CallOps.push_back(Acc);
16870     }
16871     for (unsigned i=1; i<Ops.size(); i++)
16872       CallOps.push_back(Ops[i]);
16873     llvm::Function *F = CGM.getIntrinsic(ID);
16874     Value *Call = Builder.CreateCall(F, CallOps);
16875     return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
16876   }
16877 
16878   case PPC::BI__builtin_ppc_compare_and_swap:
16879   case PPC::BI__builtin_ppc_compare_and_swaplp: {
16880     Address Addr = EmitPointerWithAlignment(E->getArg(0));
16881     Address OldValAddr = EmitPointerWithAlignment(E->getArg(1));
16882     Value *OldVal = Builder.CreateLoad(OldValAddr);
16883     QualType AtomicTy = E->getArg(0)->getType()->getPointeeType();
16884     LValue LV = MakeAddrLValue(Addr, AtomicTy);
16885     Value *Op2 = EmitScalarExpr(E->getArg(2));
16886     auto Pair = EmitAtomicCompareExchange(
16887         LV, RValue::get(OldVal), RValue::get(Op2), E->getExprLoc(),
16888         llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Monotonic, true);
16889     // Unlike c11's atomic_compare_exchange, according to
16890     // https://www.ibm.com/docs/en/xl-c-and-cpp-aix/16.1?topic=functions-compare-swap-compare-swaplp
16891     // > In either case, the contents of the memory location specified by addr
16892     // > are copied into the memory location specified by old_val_addr.
    // But it does not specify whether the store to OldValAddr is atomic or
    // which ordering to use. Following XL's codegen, treat it as a normal
    // store.
16896     Value *LoadedVal = Pair.first.getScalarVal();
16897     Builder.CreateStore(LoadedVal, OldValAddr);
16898     return Builder.CreateZExt(Pair.second, Builder.getInt32Ty());
16899   }
16900   case PPC::BI__builtin_ppc_fetch_and_add:
16901   case PPC::BI__builtin_ppc_fetch_and_addlp: {
16902     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
16903                                  llvm::AtomicOrdering::Monotonic);
16904   }
16905   case PPC::BI__builtin_ppc_fetch_and_and:
16906   case PPC::BI__builtin_ppc_fetch_and_andlp: {
16907     return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
16908                                  llvm::AtomicOrdering::Monotonic);
16909   }
16910 
16911   case PPC::BI__builtin_ppc_fetch_and_or:
16912   case PPC::BI__builtin_ppc_fetch_and_orlp: {
16913     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
16914                                  llvm::AtomicOrdering::Monotonic);
16915   }
16916   case PPC::BI__builtin_ppc_fetch_and_swap:
16917   case PPC::BI__builtin_ppc_fetch_and_swaplp: {
16918     return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
16919                                  llvm::AtomicOrdering::Monotonic);
16920   }
16921   case PPC::BI__builtin_ppc_ldarx:
16922   case PPC::BI__builtin_ppc_lwarx:
16923   case PPC::BI__builtin_ppc_lharx:
16924   case PPC::BI__builtin_ppc_lbarx:
16925     return emitPPCLoadReserveIntrinsic(*this, BuiltinID, E);
16926   case PPC::BI__builtin_ppc_mfspr: {
16927     Value *Op0 = EmitScalarExpr(E->getArg(0));
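    // mfspr returns a register-sized value; use the target pointer width to
    // choose between i32 and i64.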
16928     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
16929                               ? Int32Ty
16930                               : Int64Ty;
16931     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mfspr, RetType);
16932     return Builder.CreateCall(F, {Op0});
16933   }
16934   case PPC::BI__builtin_ppc_mtspr: {
16935     Value *Op0 = EmitScalarExpr(E->getArg(0));
16936     Value *Op1 = EmitScalarExpr(E->getArg(1));
16937     llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
16938                               ? Int32Ty
16939                               : Int64Ty;
16940     Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtspr, RetType);
16941     return Builder.CreateCall(F, {Op0, Op1});
16942   }
16943   case PPC::BI__builtin_ppc_popcntb: {
16944     Value *ArgValue = EmitScalarExpr(E->getArg(0));
16945     llvm::Type *ArgType = ArgValue->getType();
16946     Function *F = CGM.getIntrinsic(Intrinsic::ppc_popcntb, {ArgType, ArgType});
16947     return Builder.CreateCall(F, {ArgValue}, "popcntb");
16948   }
16949   case PPC::BI__builtin_ppc_mtfsf: {
16950     // The builtin takes a uint32 that needs to be cast to an
16951     // f64 to be passed to the intrinsic.
16952     Value *Op0 = EmitScalarExpr(E->getArg(0));
16953     Value *Op1 = EmitScalarExpr(E->getArg(1));
16954     Value *Cast = Builder.CreateUIToFP(Op1, DoubleTy);
16955     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtfsf);
16956     return Builder.CreateCall(F, {Op0, Cast}, "");
16957   }
16958 
16959   case PPC::BI__builtin_ppc_swdiv_nochk:
16960   case PPC::BI__builtin_ppc_swdivs_nochk: {
16961     Value *Op0 = EmitScalarExpr(E->getArg(0));
16962     Value *Op1 = EmitScalarExpr(E->getArg(1));
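    // Force fast-math flags on for the division so it can be expanded
    // without the usual range checks, then restore the previous flags.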
16963     FastMathFlags FMF = Builder.getFastMathFlags();
16964     Builder.getFastMathFlags().setFast();
16965     Value *FDiv = Builder.CreateFDiv(Op0, Op1, "swdiv_nochk");
16966     Builder.getFastMathFlags() &= (FMF);
16967     return FDiv;
16968   }
16969   case PPC::BI__builtin_ppc_fric:
16970     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16971                            *this, E, Intrinsic::rint,
16972                            Intrinsic::experimental_constrained_rint))
16973         .getScalarVal();
16974   case PPC::BI__builtin_ppc_frim:
16975   case PPC::BI__builtin_ppc_frims:
16976     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16977                            *this, E, Intrinsic::floor,
16978                            Intrinsic::experimental_constrained_floor))
16979         .getScalarVal();
16980   case PPC::BI__builtin_ppc_frin:
16981   case PPC::BI__builtin_ppc_frins:
16982     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16983                            *this, E, Intrinsic::round,
16984                            Intrinsic::experimental_constrained_round))
16985         .getScalarVal();
16986   case PPC::BI__builtin_ppc_frip:
16987   case PPC::BI__builtin_ppc_frips:
16988     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16989                            *this, E, Intrinsic::ceil,
16990                            Intrinsic::experimental_constrained_ceil))
16991         .getScalarVal();
16992   case PPC::BI__builtin_ppc_friz:
16993   case PPC::BI__builtin_ppc_frizs:
16994     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
16995                            *this, E, Intrinsic::trunc,
16996                            Intrinsic::experimental_constrained_trunc))
16997         .getScalarVal();
16998   case PPC::BI__builtin_ppc_fsqrt:
16999   case PPC::BI__builtin_ppc_fsqrts:
17000     return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17001                            *this, E, Intrinsic::sqrt,
17002                            Intrinsic::experimental_constrained_sqrt))
17003         .getScalarVal();
17004   case PPC::BI__builtin_ppc_test_data_class: {
17005     Value *Op0 = EmitScalarExpr(E->getArg(0));
17006     Value *Op1 = EmitScalarExpr(E->getArg(1));
17007     return Builder.CreateCall(
17008         CGM.getIntrinsic(Intrinsic::ppc_test_data_class, Op0->getType()),
17009         {Op0, Op1}, "test_data_class");
17010   }
17011   case PPC::BI__builtin_ppc_maxfe: {
17012     Value *Op0 = EmitScalarExpr(E->getArg(0));
17013     Value *Op1 = EmitScalarExpr(E->getArg(1));
17014     Value *Op2 = EmitScalarExpr(E->getArg(2));
17015     Value *Op3 = EmitScalarExpr(E->getArg(3));
17016     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfe),
17017                               {Op0, Op1, Op2, Op3});
17018   }
17019   case PPC::BI__builtin_ppc_maxfl: {
17020     Value *Op0 = EmitScalarExpr(E->getArg(0));
17021     Value *Op1 = EmitScalarExpr(E->getArg(1));
17022     Value *Op2 = EmitScalarExpr(E->getArg(2));
17023     Value *Op3 = EmitScalarExpr(E->getArg(3));
17024     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfl),
17025                               {Op0, Op1, Op2, Op3});
17026   }
17027   case PPC::BI__builtin_ppc_maxfs: {
17028     Value *Op0 = EmitScalarExpr(E->getArg(0));
17029     Value *Op1 = EmitScalarExpr(E->getArg(1));
17030     Value *Op2 = EmitScalarExpr(E->getArg(2));
17031     Value *Op3 = EmitScalarExpr(E->getArg(3));
17032     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfs),
17033                               {Op0, Op1, Op2, Op3});
17034   }
17035   case PPC::BI__builtin_ppc_minfe: {
17036     Value *Op0 = EmitScalarExpr(E->getArg(0));
17037     Value *Op1 = EmitScalarExpr(E->getArg(1));
17038     Value *Op2 = EmitScalarExpr(E->getArg(2));
17039     Value *Op3 = EmitScalarExpr(E->getArg(3));
17040     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfe),
17041                               {Op0, Op1, Op2, Op3});
17042   }
17043   case PPC::BI__builtin_ppc_minfl: {
17044     Value *Op0 = EmitScalarExpr(E->getArg(0));
17045     Value *Op1 = EmitScalarExpr(E->getArg(1));
17046     Value *Op2 = EmitScalarExpr(E->getArg(2));
17047     Value *Op3 = EmitScalarExpr(E->getArg(3));
17048     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfl),
17049                               {Op0, Op1, Op2, Op3});
17050   }
17051   case PPC::BI__builtin_ppc_minfs: {
17052     Value *Op0 = EmitScalarExpr(E->getArg(0));
17053     Value *Op1 = EmitScalarExpr(E->getArg(1));
17054     Value *Op2 = EmitScalarExpr(E->getArg(2));
17055     Value *Op3 = EmitScalarExpr(E->getArg(3));
17056     return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfs),
17057                               {Op0, Op1, Op2, Op3});
17058   }
17059   case PPC::BI__builtin_ppc_swdiv:
17060   case PPC::BI__builtin_ppc_swdivs: {
17061     Value *Op0 = EmitScalarExpr(E->getArg(0));
17062     Value *Op1 = EmitScalarExpr(E->getArg(1));
17063     return Builder.CreateFDiv(Op0, Op1, "swdiv");
17064   }
17065   }
17066 }
17067 
17068 namespace {
// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
17071 Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
17072                              const CallExpr *E = nullptr) {
17073   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
17074   auto *Call = CGF.Builder.CreateCall(F);
17075   Call->addRetAttr(
17076       Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
17077   Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
17078   if (!E)
17079     return Call;
17080   QualType BuiltinRetType = E->getType();
17081   auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
17082   if (RetTy == Call->getType())
17083     return Call;
17084   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
17085 }
17086 
17087 Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
17088   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
17089   auto *Call = CGF.Builder.CreateCall(F);
17090   Call->addRetAttr(
17091       Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
17092   Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
17093   return Call;
17094 }
17095 
17096 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
17097 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
17098   bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
17099                  clang::TargetOptions::COV_5;
17100   Constant *Offset;
17101   Value *DP;
17102   if (IsCOV_5) {
17103     // Indexing the implicit kernarg segment.
17104     Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
17105     DP = EmitAMDGPUImplicitArgPtr(CGF);
17106   } else {
17107     // Indexing the HSA kernel_dispatch_packet struct.
17108     Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
17109     DP = EmitAMDGPUDispatchPtr(CGF);
17110   }
17111 
17112   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
17113   auto *LD = CGF.Builder.CreateLoad(
17114       Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
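  // The workgroup size is known to lie in [1, MaxOpenCLWorkGroupSize], so
  // attach a range and mark the load as noundef and invariant.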
17115   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
17116   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
17117       APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
17118   LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
17119   LD->setMetadata(llvm::LLVMContext::MD_noundef,
17120                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17121   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17122                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17123   return LD;
17124 }
17125 
17126 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
17127 Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
17128   const unsigned XOffset = 12;
17129   auto *DP = EmitAMDGPUDispatchPtr(CGF);
17130   // Indexing the HSA kernel_dispatch_packet struct.
17131   auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
17132   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
17133   auto *LD = CGF.Builder.CreateLoad(
17134       Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
17135   LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17136                   llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17137   return LD;
17138 }
17139 } // namespace
17140 
17141 // For processing memory ordering and memory scope arguments of various
17142 // amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, and writes
// it to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID and writes it to \p SSID.
17147 void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
17148                                               llvm::AtomicOrdering &AO,
17149                                               llvm::SyncScope::ID &SSID) {
17150   int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
17151 
17152   // Map C11/C++11 memory ordering to LLVM memory ordering
17153   assert(llvm::isValidAtomicOrderingCABI(ord));
17154   switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
17155   case llvm::AtomicOrderingCABI::acquire:
17156   case llvm::AtomicOrderingCABI::consume:
17157     AO = llvm::AtomicOrdering::Acquire;
17158     break;
17159   case llvm::AtomicOrderingCABI::release:
17160     AO = llvm::AtomicOrdering::Release;
17161     break;
17162   case llvm::AtomicOrderingCABI::acq_rel:
17163     AO = llvm::AtomicOrdering::AcquireRelease;
17164     break;
17165   case llvm::AtomicOrderingCABI::seq_cst:
17166     AO = llvm::AtomicOrdering::SequentiallyConsistent;
17167     break;
17168   case llvm::AtomicOrderingCABI::relaxed:
17169     AO = llvm::AtomicOrdering::Monotonic;
17170     break;
17171   }
17172 
17173   StringRef scp;
17174   llvm::getConstantStringInfo(Scope, scp);
17175   SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
17176 }
17177 
17178 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
17179                                               const CallExpr *E) {
17180   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
17181   llvm::SyncScope::ID SSID;
17182   switch (BuiltinID) {
17183   case AMDGPU::BI__builtin_amdgcn_div_scale:
17184   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
17186     // argument.
17187 
17188     Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
17189 
17190     llvm::Value *X = EmitScalarExpr(E->getArg(0));
17191     llvm::Value *Y = EmitScalarExpr(E->getArg(1));
17192     llvm::Value *Z = EmitScalarExpr(E->getArg(2));
17193 
17194     llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
17195                                            X->getType());
17196 
17197     llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
17198 
17199     llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
17200     llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
17201 
17202     llvm::Type *RealFlagType = FlagOutPtr.getElementType();
17203 
17204     llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
17205     Builder.CreateStore(FlagExt, FlagOutPtr);
17206     return Result;
17207   }
17208   case AMDGPU::BI__builtin_amdgcn_div_fmas:
17209   case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
17210     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17211     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17212     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17213     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
17214 
17215     llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
17216                                       Src0->getType());
17217     llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
17218     return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
17219   }
17220 
17221   case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
17222     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
17223   case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
17224     return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_mov_dpp8);
17225   case AMDGPU::BI__builtin_amdgcn_mov_dpp:
17226   case AMDGPU::BI__builtin_amdgcn_update_dpp: {
17227     llvm::SmallVector<llvm::Value *, 6> Args;
17228     for (unsigned I = 0; I != E->getNumArgs(); ++I)
17229       Args.push_back(EmitScalarExpr(E->getArg(I)));
17230     assert(Args.size() == 5 || Args.size() == 6);
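    // __builtin_amdgcn_mov_dpp has no 'old' operand; pass poison for it so
    // both builtins can share the update_dpp intrinsic.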
17231     if (Args.size() == 5)
17232       Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
17233     Function *F =
17234         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
17235     return Builder.CreateCall(F, Args);
17236   }
17237   case AMDGPU::BI__builtin_amdgcn_div_fixup:
17238   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
17239   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
17240     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
17241   case AMDGPU::BI__builtin_amdgcn_trig_preop:
17242   case AMDGPU::BI__builtin_amdgcn_trig_preopf:
17243     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
17244   case AMDGPU::BI__builtin_amdgcn_rcp:
17245   case AMDGPU::BI__builtin_amdgcn_rcpf:
17246   case AMDGPU::BI__builtin_amdgcn_rcph:
17247     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
17248   case AMDGPU::BI__builtin_amdgcn_sqrt:
17249   case AMDGPU::BI__builtin_amdgcn_sqrtf:
17250   case AMDGPU::BI__builtin_amdgcn_sqrth:
17251     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sqrt);
17252   case AMDGPU::BI__builtin_amdgcn_rsq:
17253   case AMDGPU::BI__builtin_amdgcn_rsqf:
17254   case AMDGPU::BI__builtin_amdgcn_rsqh:
17255     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
17256   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
17257   case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
17258     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
17259   case AMDGPU::BI__builtin_amdgcn_sinf:
17260   case AMDGPU::BI__builtin_amdgcn_sinh:
17261     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
17262   case AMDGPU::BI__builtin_amdgcn_cosf:
17263   case AMDGPU::BI__builtin_amdgcn_cosh:
17264     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
17265   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
17266     return EmitAMDGPUDispatchPtr(*this, E);
17267   case AMDGPU::BI__builtin_amdgcn_logf:
17268     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
17269   case AMDGPU::BI__builtin_amdgcn_exp2f:
17270     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
17271   case AMDGPU::BI__builtin_amdgcn_log_clampf:
17272     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
17273   case AMDGPU::BI__builtin_amdgcn_ldexp:
17274   case AMDGPU::BI__builtin_amdgcn_ldexpf:
17275   case AMDGPU::BI__builtin_amdgcn_ldexph: {
17276     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17277     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17278     llvm::Function *F =
17279         CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
17280     return Builder.CreateCall(F, {Src0, Src1});
17281   }
17282   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
17283   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
17284   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
17285     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
17286   case AMDGPU::BI__builtin_amdgcn_frexp_exp:
17287   case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
17288     Value *Src0 = EmitScalarExpr(E->getArg(0));
17289     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17290                                 { Builder.getInt32Ty(), Src0->getType() });
17291     return Builder.CreateCall(F, Src0);
17292   }
17293   case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
17294     Value *Src0 = EmitScalarExpr(E->getArg(0));
17295     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17296                                 { Builder.getInt16Ty(), Src0->getType() });
17297     return Builder.CreateCall(F, Src0);
17298   }
17299   case AMDGPU::BI__builtin_amdgcn_fract:
17300   case AMDGPU::BI__builtin_amdgcn_fractf:
17301   case AMDGPU::BI__builtin_amdgcn_fracth:
17302     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
17303   case AMDGPU::BI__builtin_amdgcn_lerp:
17304     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
17305   case AMDGPU::BI__builtin_amdgcn_ubfe:
17306     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_ubfe);
17307   case AMDGPU::BI__builtin_amdgcn_sbfe:
17308     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_sbfe);
17309   case AMDGPU::BI__builtin_amdgcn_ballot_w32:
17310   case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
17311     llvm::Type *ResultType = ConvertType(E->getType());
17312     llvm::Value *Src = EmitScalarExpr(E->getArg(0));
17313     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
17314     return Builder.CreateCall(F, { Src });
17315   }
17316   case AMDGPU::BI__builtin_amdgcn_uicmp:
17317   case AMDGPU::BI__builtin_amdgcn_uicmpl:
17318   case AMDGPU::BI__builtin_amdgcn_sicmp:
17319   case AMDGPU::BI__builtin_amdgcn_sicmpl: {
17320     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17321     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17322     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17323 
17324     // FIXME-GFX10: How should 32 bit mask be handled?
17325     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
17326       { Builder.getInt64Ty(), Src0->getType() });
17327     return Builder.CreateCall(F, { Src0, Src1, Src2 });
17328   }
17329   case AMDGPU::BI__builtin_amdgcn_fcmp:
17330   case AMDGPU::BI__builtin_amdgcn_fcmpf: {
17331     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17332     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17333     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17334 
17335     // FIXME-GFX10: How should 32 bit mask be handled?
17336     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
17337       { Builder.getInt64Ty(), Src0->getType() });
17338     return Builder.CreateCall(F, { Src0, Src1, Src2 });
17339   }
17340   case AMDGPU::BI__builtin_amdgcn_class:
17341   case AMDGPU::BI__builtin_amdgcn_classf:
17342   case AMDGPU::BI__builtin_amdgcn_classh:
17343     return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
17344   case AMDGPU::BI__builtin_amdgcn_fmed3f:
17345   case AMDGPU::BI__builtin_amdgcn_fmed3h:
17346     return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
17347   case AMDGPU::BI__builtin_amdgcn_ds_append:
17348   case AMDGPU::BI__builtin_amdgcn_ds_consume: {
17349     Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
17350       Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
17351     Value *Src0 = EmitScalarExpr(E->getArg(0));
17352     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
17353     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
17354   }
17355   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
17356   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
17357   case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
17358     Intrinsic::ID Intrin;
17359     switch (BuiltinID) {
17360     case AMDGPU::BI__builtin_amdgcn_ds_faddf:
17361       Intrin = Intrinsic::amdgcn_ds_fadd;
17362       break;
17363     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
17364       Intrin = Intrinsic::amdgcn_ds_fmin;
17365       break;
17366     case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
17367       Intrin = Intrinsic::amdgcn_ds_fmax;
17368       break;
17369     }
17370     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17371     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17372     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17373     llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
17374     llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
17375     llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
17376     llvm::FunctionType *FTy = F->getFunctionType();
17377     llvm::Type *PTy = FTy->getParamType(0);
17378     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
17379     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
17380   }
17381   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
17382   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
17383   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
17384   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
17385   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
17386   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
17387   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
17388   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
17389   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
17390   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
17391     Intrinsic::ID IID;
17392     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
17393     switch (BuiltinID) {
17394     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
17395       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
17396       IID = Intrinsic::amdgcn_global_atomic_fadd;
17397       break;
17398     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
17399       ArgTy = llvm::FixedVectorType::get(
17400           llvm::Type::getHalfTy(getLLVMContext()), 2);
17401       IID = Intrinsic::amdgcn_global_atomic_fadd;
17402       break;
17403     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
17404       IID = Intrinsic::amdgcn_global_atomic_fadd;
17405       break;
17406     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
17407       IID = Intrinsic::amdgcn_global_atomic_fmin;
17408       break;
17409     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
17410       IID = Intrinsic::amdgcn_global_atomic_fmax;
17411       break;
17412     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
17413       IID = Intrinsic::amdgcn_flat_atomic_fadd;
17414       break;
17415     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
17416       IID = Intrinsic::amdgcn_flat_atomic_fmin;
17417       break;
17418     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
17419       IID = Intrinsic::amdgcn_flat_atomic_fmax;
17420       break;
17421     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
17422       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
17423       IID = Intrinsic::amdgcn_flat_atomic_fadd;
17424       break;
17425     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
17426       ArgTy = llvm::FixedVectorType::get(
17427           llvm::Type::getHalfTy(getLLVMContext()), 2);
17428       IID = Intrinsic::amdgcn_flat_atomic_fadd;
17429       break;
17430     }
17431     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
17432     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
17433     llvm::Function *F =
17434         CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
17435     return Builder.CreateCall(F, {Addr, Val});
17436   }
17437   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
17438   case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
17439     Intrinsic::ID IID;
17440     switch (BuiltinID) {
17441     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
17442       IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
17443       break;
17444     case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
17445       IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
17446       break;
17447     }
17448     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
17449     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
17450     llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
17451     return Builder.CreateCall(F, {Addr, Val});
17452   }
17453   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
17454   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
17455   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
17456     Intrinsic::ID IID;
17457     llvm::Type *ArgTy;
17458     switch (BuiltinID) {
17459     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
17460       ArgTy = llvm::Type::getFloatTy(getLLVMContext());
17461       IID = Intrinsic::amdgcn_ds_fadd;
17462       break;
17463     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
17464       ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
17465       IID = Intrinsic::amdgcn_ds_fadd;
17466       break;
17467     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
17468       ArgTy = llvm::FixedVectorType::get(
17469           llvm::Type::getHalfTy(getLLVMContext()), 2);
17470       IID = Intrinsic::amdgcn_ds_fadd;
17471       break;
17472     }
17473     llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
17474     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
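    // The intrinsic takes three trailing operands besides the address and
    // value; pass constant zeros (i32 0, i32 0, i1 false) for them.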
17475     llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
17476         llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
17477     llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
17478         llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
17479     llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
17480     return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
17481   }
17482   case AMDGPU::BI__builtin_amdgcn_read_exec: {
17483     CallInst *CI = cast<CallInst>(
17484       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
17485     CI->setConvergent();
17486     return CI;
17487   }
17488   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
17489   case AMDGPU::BI__builtin_amdgcn_read_exec_hi: {
17490     StringRef RegName = BuiltinID == AMDGPU::BI__builtin_amdgcn_read_exec_lo ?
17491       "exec_lo" : "exec_hi";
17492     CallInst *CI = cast<CallInst>(
17493       EmitSpecialRegisterBuiltin(*this, E, Int32Ty, Int32Ty, NormalRead, RegName));
17494     CI->setConvergent();
17495     return CI;
17496   }
17497   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
17498   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
17499   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
17500   case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
17501     llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
17502     llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
17503     llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
17504     llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
17505     llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
17506     llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
17507 
17508     // The builtins take these arguments as vec4 where the last element is
17509     // ignored. The intrinsic takes them as vec3.
17510     RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
17511                                             ArrayRef<int>{0, 1, 2});
17512     RayDir =
17513         Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
17514     RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
17515                                                 ArrayRef<int>{0, 1, 2});
17516 
17517     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
17518                                    {NodePtr->getType(), RayDir->getType()});
17519     return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
17520                                   RayInverseDir, TextureDescr});
17521   }
17522 
17523   case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
17524     SmallVector<Value *, 4> Args;
17525     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
17526       Args.push_back(EmitScalarExpr(E->getArg(i)));
17527 
17528     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
17529     Value *Call = Builder.CreateCall(F, Args);
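    // The intrinsic returns two values; repack them into the two-element
    // vector the builtin returns.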
17530     Value *Rtn = Builder.CreateExtractValue(Call, 0);
17531     Value *A = Builder.CreateExtractValue(Call, 1);
17532     llvm::Type *RetTy = ConvertType(E->getType());
17533     Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
17534                                             (uint64_t)0);
17535     return Builder.CreateInsertElement(I0, A, 1);
17536   }
17537 
17538   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
17539   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
17540   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
17541   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
17542   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
17543   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
17544   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
17545   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
17546   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
17547   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
17548   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
17549   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
17550 
17551     // These operations perform a matrix multiplication and accumulation of
17552     // the form:
17553     //             D = A * B + C
17554     // The return type always matches the type of matrix C.
17555     unsigned ArgForMatchingRetType;
17556     unsigned BuiltinWMMAOp;
17557 
17558     switch (BuiltinID) {
17559     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
17560     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
17561       ArgForMatchingRetType = 2;
17562       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
17563       break;
17564     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
17565     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
17566       ArgForMatchingRetType = 2;
17567       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
17568       break;
17569     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
17570     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
17571       ArgForMatchingRetType = 2;
17572       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
17573       break;
17574     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
17575     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
17576       ArgForMatchingRetType = 2;
17577       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
17578       break;
17579     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
17580     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
17581       ArgForMatchingRetType = 4;
17582       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
17583       break;
17584     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
17585     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
17586       ArgForMatchingRetType = 4;
17587       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
17588       break;
17589     }
17590 
17591     SmallVector<Value *, 6> Args;
17592     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
17593       Args.push_back(EmitScalarExpr(E->getArg(i)));
17594 
17595     Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
17596                                    {Args[ArgForMatchingRetType]->getType()});
17597 
17598     return Builder.CreateCall(F, Args);
17599   }
17600 
17601   // amdgcn workitem
17602   case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
17603     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
17604   case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
17605     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
17606   case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
17607     return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
17608 
17609   // amdgcn workgroup size
17610   case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
17611     return EmitAMDGPUWorkGroupSize(*this, 0);
17612   case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
17613     return EmitAMDGPUWorkGroupSize(*this, 1);
17614   case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
17615     return EmitAMDGPUWorkGroupSize(*this, 2);
17616 
17617   // amdgcn grid size
17618   case AMDGPU::BI__builtin_amdgcn_grid_size_x:
17619     return EmitAMDGPUGridSize(*this, 0);
17620   case AMDGPU::BI__builtin_amdgcn_grid_size_y:
17621     return EmitAMDGPUGridSize(*this, 1);
17622   case AMDGPU::BI__builtin_amdgcn_grid_size_z:
17623     return EmitAMDGPUGridSize(*this, 2);
17624 
17625   // r600 intrinsics
17626   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
17627   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
17628     return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
17629   case AMDGPU::BI__builtin_r600_read_tidig_x:
17630     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
17631   case AMDGPU::BI__builtin_r600_read_tidig_y:
17632     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
17633   case AMDGPU::BI__builtin_r600_read_tidig_z:
17634     return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
17635   case AMDGPU::BI__builtin_amdgcn_alignbit: {
17636     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17637     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17638     llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17639     Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
17640     return Builder.CreateCall(F, { Src0, Src1, Src2 });
17641   }
17642   case AMDGPU::BI__builtin_amdgcn_fence: {
17643     ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
17644                             EmitScalarExpr(E->getArg(1)), AO, SSID);
17645     return Builder.CreateFence(AO, SSID);
17646   }
17647   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
17648   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
17649   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
17650   case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
17651     llvm::AtomicRMWInst::BinOp BinOp;
17652     switch (BuiltinID) {
17653     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
17654     case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
17655       BinOp = llvm::AtomicRMWInst::UIncWrap;
17656       break;
17657     case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
17658     case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
17659       BinOp = llvm::AtomicRMWInst::UDecWrap;
17660       break;
17661     }
17662 
17663     Value *Ptr = EmitScalarExpr(E->getArg(0));
17664     Value *Val = EmitScalarExpr(E->getArg(1));
17665 
17666     ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
17667                             EmitScalarExpr(E->getArg(3)), AO, SSID);
17668 
17669     QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
17670     bool Volatile =
17671         PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
17672 
17673     llvm::AtomicRMWInst *RMW =
17674         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
17675     if (Volatile)
17676       RMW->setVolatile(true);
17677     return RMW;
17678   }
17679   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
17680   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
17681     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
17682     llvm::Type *ResultType = ConvertType(E->getType());
17683     // s_sendmsg_rtn is mangled using return type only.
17684     Function *F =
17685         CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
17686     return Builder.CreateCall(F, {Arg});
17687   }
17688   default:
17689     return nullptr;
17690   }
17691 }
17692 
17693 /// Handle a SystemZ function in which the final argument is a pointer
17694 /// to an int that receives the post-instruction CC value.  At the LLVM level
17695 /// this is represented as a function that returns a {result, cc} pair.
17696 static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
17697                                          unsigned IntrinsicID,
17698                                          const CallExpr *E) {
17699   unsigned NumArgs = E->getNumArgs() - 1;
17700   SmallVector<Value *, 8> Args(NumArgs);
17701   for (unsigned I = 0; I < NumArgs; ++I)
17702     Args[I] = CGF.EmitScalarExpr(E->getArg(I));
17703   Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
17704   Function *F = CGF.CGM.getIntrinsic(IntrinsicID);
17705   Value *Call = CGF.Builder.CreateCall(F, Args);
17706   Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
17707   CGF.Builder.CreateStore(CC, CCPtr);
17708   return CGF.Builder.CreateExtractValue(Call, 0);
17709 }
17710 
17711 Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
17712                                                const CallExpr *E) {
17713   switch (BuiltinID) {
17714   case SystemZ::BI__builtin_tbegin: {
17715     Value *TDB = EmitScalarExpr(E->getArg(0));
17716     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
17717     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
17718     return Builder.CreateCall(F, {TDB, Control});
17719   }
17720   case SystemZ::BI__builtin_tbegin_nofloat: {
17721     Value *TDB = EmitScalarExpr(E->getArg(0));
17722     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
17723     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
17724     return Builder.CreateCall(F, {TDB, Control});
17725   }
17726   case SystemZ::BI__builtin_tbeginc: {
17727     Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
17728     Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
17729     Function *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
17730     return Builder.CreateCall(F, {TDB, Control});
17731   }
17732   case SystemZ::BI__builtin_tabort: {
17733     Value *Data = EmitScalarExpr(E->getArg(0));
17734     Function *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
17735     return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
17736   }
17737   case SystemZ::BI__builtin_non_tx_store: {
17738     Value *Address = EmitScalarExpr(E->getArg(0));
17739     Value *Data = EmitScalarExpr(E->getArg(1));
17740     Function *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
17741     return Builder.CreateCall(F, {Data, Address});
17742   }
17743 
17744   // Vector builtins.  Note that most vector builtins are mapped automatically
17745   // to target-specific LLVM intrinsics.  The ones handled specially here can
  // be represented via standard LLVM IR, which is preferable since it enables
  // common LLVM optimizations.
17748 
17749   case SystemZ::BI__builtin_s390_vpopctb:
17750   case SystemZ::BI__builtin_s390_vpopcth:
17751   case SystemZ::BI__builtin_s390_vpopctf:
17752   case SystemZ::BI__builtin_s390_vpopctg: {
17753     llvm::Type *ResultType = ConvertType(E->getType());
17754     Value *X = EmitScalarExpr(E->getArg(0));
17755     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
17756     return Builder.CreateCall(F, X);
17757   }
17758 
17759   case SystemZ::BI__builtin_s390_vclzb:
17760   case SystemZ::BI__builtin_s390_vclzh:
17761   case SystemZ::BI__builtin_s390_vclzf:
17762   case SystemZ::BI__builtin_s390_vclzg: {
17763     llvm::Type *ResultType = ConvertType(E->getType());
17764     Value *X = EmitScalarExpr(E->getArg(0));
17765     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
17766     Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
17767     return Builder.CreateCall(F, {X, Undef});
17768   }
17769 
17770   case SystemZ::BI__builtin_s390_vctzb:
17771   case SystemZ::BI__builtin_s390_vctzh:
17772   case SystemZ::BI__builtin_s390_vctzf:
17773   case SystemZ::BI__builtin_s390_vctzg: {
17774     llvm::Type *ResultType = ConvertType(E->getType());
17775     Value *X = EmitScalarExpr(E->getArg(0));
17776     Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
17777     Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
17778     return Builder.CreateCall(F, {X, Undef});
17779   }
17780 
17781   case SystemZ::BI__builtin_s390_vfsqsb:
17782   case SystemZ::BI__builtin_s390_vfsqdb: {
17783     llvm::Type *ResultType = ConvertType(E->getType());
17784     Value *X = EmitScalarExpr(E->getArg(0));
17785     if (Builder.getIsFPConstrained()) {
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
                                     ResultType);
      return Builder.CreateConstrainedFPCall(F, {X});
17788     } else {
17789       Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
17790       return Builder.CreateCall(F, X);
17791     }
17792   }
17793   case SystemZ::BI__builtin_s390_vfmasb:
17794   case SystemZ::BI__builtin_s390_vfmadb: {
17795     llvm::Type *ResultType = ConvertType(E->getType());
17796     Value *X = EmitScalarExpr(E->getArg(0));
17797     Value *Y = EmitScalarExpr(E->getArg(1));
17798     Value *Z = EmitScalarExpr(E->getArg(2));
17799     if (Builder.getIsFPConstrained()) {
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma,
                                     ResultType);
17801       return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
17802     } else {
17803       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17804       return Builder.CreateCall(F, {X, Y, Z});
17805     }
17806   }
17807   case SystemZ::BI__builtin_s390_vfmssb:
17808   case SystemZ::BI__builtin_s390_vfmsdb: {
17809     llvm::Type *ResultType = ConvertType(E->getType());
17810     Value *X = EmitScalarExpr(E->getArg(0));
17811     Value *Y = EmitScalarExpr(E->getArg(1));
17812     Value *Z = EmitScalarExpr(E->getArg(2));
17813     if (Builder.getIsFPConstrained()) {
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma,
                                     ResultType);
      return Builder.CreateConstrainedFPCall(
          F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17816     } else {
17817       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17818       return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17819     }
17820   }
17821   case SystemZ::BI__builtin_s390_vfnmasb:
17822   case SystemZ::BI__builtin_s390_vfnmadb: {
17823     llvm::Type *ResultType = ConvertType(E->getType());
17824     Value *X = EmitScalarExpr(E->getArg(0));
17825     Value *Y = EmitScalarExpr(E->getArg(1));
17826     Value *Z = EmitScalarExpr(E->getArg(2));
17827     if (Builder.getIsFPConstrained()) {
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma,
                                     ResultType);
      return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y, Z}),
                                "neg");
17830     } else {
17831       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17832       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
17833     }
17834   }
17835   case SystemZ::BI__builtin_s390_vfnmssb:
17836   case SystemZ::BI__builtin_s390_vfnmsdb: {
17837     llvm::Type *ResultType = ConvertType(E->getType());
17838     Value *X = EmitScalarExpr(E->getArg(0));
17839     Value *Y = EmitScalarExpr(E->getArg(1));
17840     Value *Z = EmitScalarExpr(E->getArg(2));
17841     if (Builder.getIsFPConstrained()) {
      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma,
                                     ResultType);
      Value *NegZ = Builder.CreateFNeg(Z, "sub");
      return Builder.CreateFNeg(
          Builder.CreateConstrainedFPCall(F, {X, Y, NegZ}));
17845     } else {
17846       Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17847       Value *NegZ = Builder.CreateFNeg(Z, "neg");
17848       return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, NegZ}));
17849     }
17850   }
17851   case SystemZ::BI__builtin_s390_vflpsb:
17852   case SystemZ::BI__builtin_s390_vflpdb: {
17853     llvm::Type *ResultType = ConvertType(E->getType());
17854     Value *X = EmitScalarExpr(E->getArg(0));
17855     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
17856     return Builder.CreateCall(F, X);
17857   }
17858   case SystemZ::BI__builtin_s390_vflnsb:
17859   case SystemZ::BI__builtin_s390_vflndb: {
17860     llvm::Type *ResultType = ConvertType(E->getType());
17861     Value *X = EmitScalarExpr(E->getArg(0));
17862     Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
17863     return Builder.CreateFNeg(Builder.CreateCall(F, X), "neg");
17864   }
17865   case SystemZ::BI__builtin_s390_vfisb:
17866   case SystemZ::BI__builtin_s390_vfidb: {
17867     llvm::Type *ResultType = ConvertType(E->getType());
17868     Value *X = EmitScalarExpr(E->getArg(0));
17869     // Constant-fold the M4 and M5 mask arguments.
17870     llvm::APSInt M4 = *E->getArg(1)->getIntegerConstantExpr(getContext());
17871     llvm::APSInt M5 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
17873     // intrinsic.  We only support some combinations of M4 and M5.
17874     Intrinsic::ID ID = Intrinsic::not_intrinsic;
17875     Intrinsic::ID CI;
17876     switch (M4.getZExtValue()) {
17877     default: break;
17878     case 0:  // IEEE-inexact exception allowed
17879       switch (M5.getZExtValue()) {
17880       default: break;
17881       case 0: ID = Intrinsic::rint;
17882               CI = Intrinsic::experimental_constrained_rint; break;
17883       }
17884       break;
17885     case 4:  // IEEE-inexact exception suppressed
17886       switch (M5.getZExtValue()) {
17887       default: break;
17888       case 0: ID = Intrinsic::nearbyint;
17889               CI = Intrinsic::experimental_constrained_nearbyint; break;
17890       case 1: ID = Intrinsic::round;
17891               CI = Intrinsic::experimental_constrained_round; break;
17892       case 5: ID = Intrinsic::trunc;
17893               CI = Intrinsic::experimental_constrained_trunc; break;
17894       case 6: ID = Intrinsic::ceil;
17895               CI = Intrinsic::experimental_constrained_ceil; break;
17896       case 7: ID = Intrinsic::floor;
17897               CI = Intrinsic::experimental_constrained_floor; break;
17898       }
17899       break;
17900     }
17901     if (ID != Intrinsic::not_intrinsic) {
17902       if (Builder.getIsFPConstrained()) {
17903         Function *F = CGM.getIntrinsic(CI, ResultType);
17904         return Builder.CreateConstrainedFPCall(F, X);
17905       } else {
17906         Function *F = CGM.getIntrinsic(ID, ResultType);
17907         return Builder.CreateCall(F, X);
17908       }
17909     }
17910     switch (BuiltinID) { // FIXME: constrained version?
17911       case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
17912       case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
17913       default: llvm_unreachable("Unknown BuiltinID");
17914     }
17915     Function *F = CGM.getIntrinsic(ID);
17916     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
17917     Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
17918     return Builder.CreateCall(F, {X, M4Value, M5Value});
17919   }
17920   case SystemZ::BI__builtin_s390_vfmaxsb:
17921   case SystemZ::BI__builtin_s390_vfmaxdb: {
17922     llvm::Type *ResultType = ConvertType(E->getType());
17923     Value *X = EmitScalarExpr(E->getArg(0));
17924     Value *Y = EmitScalarExpr(E->getArg(1));
17925     // Constant-fold the M4 mask argument.
17926     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
17928     // intrinsic.  We only support some values of M4.
17929     Intrinsic::ID ID = Intrinsic::not_intrinsic;
17930     Intrinsic::ID CI;
17931     switch (M4.getZExtValue()) {
17932     default: break;
17933     case 4: ID = Intrinsic::maxnum;
17934             CI = Intrinsic::experimental_constrained_maxnum; break;
17935     }
17936     if (ID != Intrinsic::not_intrinsic) {
17937       if (Builder.getIsFPConstrained()) {
17938         Function *F = CGM.getIntrinsic(CI, ResultType);
17939         return Builder.CreateConstrainedFPCall(F, {X, Y});
17940       } else {
17941         Function *F = CGM.getIntrinsic(ID, ResultType);
17942         return Builder.CreateCall(F, {X, Y});
17943       }
17944     }
17945     switch (BuiltinID) {
17946       case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
17947       case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
17948       default: llvm_unreachable("Unknown BuiltinID");
17949     }
17950     Function *F = CGM.getIntrinsic(ID);
17951     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
17952     return Builder.CreateCall(F, {X, Y, M4Value});
17953   }
17954   case SystemZ::BI__builtin_s390_vfminsb:
17955   case SystemZ::BI__builtin_s390_vfmindb: {
17956     llvm::Type *ResultType = ConvertType(E->getType());
17957     Value *X = EmitScalarExpr(E->getArg(0));
17958     Value *Y = EmitScalarExpr(E->getArg(1));
17959     // Constant-fold the M4 mask argument.
17960     llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
17962     // intrinsic.  We only support some values of M4.
17963     Intrinsic::ID ID = Intrinsic::not_intrinsic;
17964     Intrinsic::ID CI;
17965     switch (M4.getZExtValue()) {
17966     default: break;
17967     case 4: ID = Intrinsic::minnum;
17968             CI = Intrinsic::experimental_constrained_minnum; break;
17969     }
17970     if (ID != Intrinsic::not_intrinsic) {
17971       if (Builder.getIsFPConstrained()) {
17972         Function *F = CGM.getIntrinsic(CI, ResultType);
17973         return Builder.CreateConstrainedFPCall(F, {X, Y});
17974       } else {
17975         Function *F = CGM.getIntrinsic(ID, ResultType);
17976         return Builder.CreateCall(F, {X, Y});
17977       }
17978     }
17979     switch (BuiltinID) {
17980       case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
17981       case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
17982       default: llvm_unreachable("Unknown BuiltinID");
17983     }
17984     Function *F = CGM.getIntrinsic(ID);
17985     Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
17986     return Builder.CreateCall(F, {X, Y, M4Value});
17987   }
17988 
17989   case SystemZ::BI__builtin_s390_vlbrh:
17990   case SystemZ::BI__builtin_s390_vlbrf:
17991   case SystemZ::BI__builtin_s390_vlbrg: {
17992     llvm::Type *ResultType = ConvertType(E->getType());
17993     Value *X = EmitScalarExpr(E->getArg(0));
17994     Function *F = CGM.getIntrinsic(Intrinsic::bswap, ResultType);
17995     return Builder.CreateCall(F, X);
17996   }
17997 
17998   // Vector intrinsics that output the post-instruction CC value.
17999 
18000 #define INTRINSIC_WITH_CC(NAME) \
18001     case SystemZ::BI__builtin_##NAME: \
18002       return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
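// For example, INTRINSIC_WITH_CC(s390_vpkshs) expands to
//   case SystemZ::BI__builtin_s390_vpkshs:
//     return EmitSystemZIntrinsicWithCC(*this, Intrinsic::s390_vpkshs, E)
// with the terminating ';' supplied at the use site.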
18003 
18004   INTRINSIC_WITH_CC(s390_vpkshs);
18005   INTRINSIC_WITH_CC(s390_vpksfs);
18006   INTRINSIC_WITH_CC(s390_vpksgs);
18007 
18008   INTRINSIC_WITH_CC(s390_vpklshs);
18009   INTRINSIC_WITH_CC(s390_vpklsfs);
18010   INTRINSIC_WITH_CC(s390_vpklsgs);
18011 
18012   INTRINSIC_WITH_CC(s390_vceqbs);
18013   INTRINSIC_WITH_CC(s390_vceqhs);
18014   INTRINSIC_WITH_CC(s390_vceqfs);
18015   INTRINSIC_WITH_CC(s390_vceqgs);
18016 
18017   INTRINSIC_WITH_CC(s390_vchbs);
18018   INTRINSIC_WITH_CC(s390_vchhs);
18019   INTRINSIC_WITH_CC(s390_vchfs);
18020   INTRINSIC_WITH_CC(s390_vchgs);
18021 
18022   INTRINSIC_WITH_CC(s390_vchlbs);
18023   INTRINSIC_WITH_CC(s390_vchlhs);
18024   INTRINSIC_WITH_CC(s390_vchlfs);
18025   INTRINSIC_WITH_CC(s390_vchlgs);
18026 
18027   INTRINSIC_WITH_CC(s390_vfaebs);
18028   INTRINSIC_WITH_CC(s390_vfaehs);
18029   INTRINSIC_WITH_CC(s390_vfaefs);
18030 
18031   INTRINSIC_WITH_CC(s390_vfaezbs);
18032   INTRINSIC_WITH_CC(s390_vfaezhs);
18033   INTRINSIC_WITH_CC(s390_vfaezfs);
18034 
18035   INTRINSIC_WITH_CC(s390_vfeebs);
18036   INTRINSIC_WITH_CC(s390_vfeehs);
18037   INTRINSIC_WITH_CC(s390_vfeefs);
18038 
18039   INTRINSIC_WITH_CC(s390_vfeezbs);
18040   INTRINSIC_WITH_CC(s390_vfeezhs);
18041   INTRINSIC_WITH_CC(s390_vfeezfs);
18042 
18043   INTRINSIC_WITH_CC(s390_vfenebs);
18044   INTRINSIC_WITH_CC(s390_vfenehs);
18045   INTRINSIC_WITH_CC(s390_vfenefs);
18046 
18047   INTRINSIC_WITH_CC(s390_vfenezbs);
18048   INTRINSIC_WITH_CC(s390_vfenezhs);
18049   INTRINSIC_WITH_CC(s390_vfenezfs);
18050 
18051   INTRINSIC_WITH_CC(s390_vistrbs);
18052   INTRINSIC_WITH_CC(s390_vistrhs);
18053   INTRINSIC_WITH_CC(s390_vistrfs);
18054 
18055   INTRINSIC_WITH_CC(s390_vstrcbs);
18056   INTRINSIC_WITH_CC(s390_vstrchs);
18057   INTRINSIC_WITH_CC(s390_vstrcfs);
18058 
18059   INTRINSIC_WITH_CC(s390_vstrczbs);
18060   INTRINSIC_WITH_CC(s390_vstrczhs);
18061   INTRINSIC_WITH_CC(s390_vstrczfs);
18062 
18063   INTRINSIC_WITH_CC(s390_vfcesbs);
18064   INTRINSIC_WITH_CC(s390_vfcedbs);
18065   INTRINSIC_WITH_CC(s390_vfchsbs);
18066   INTRINSIC_WITH_CC(s390_vfchdbs);
18067   INTRINSIC_WITH_CC(s390_vfchesbs);
18068   INTRINSIC_WITH_CC(s390_vfchedbs);
18069 
18070   INTRINSIC_WITH_CC(s390_vftcisb);
18071   INTRINSIC_WITH_CC(s390_vftcidb);
18072 
18073   INTRINSIC_WITH_CC(s390_vstrsb);
18074   INTRINSIC_WITH_CC(s390_vstrsh);
18075   INTRINSIC_WITH_CC(s390_vstrsf);
18076 
18077   INTRINSIC_WITH_CC(s390_vstrszb);
18078   INTRINSIC_WITH_CC(s390_vstrszh);
18079   INTRINSIC_WITH_CC(s390_vstrszf);
18080 
18081 #undef INTRINSIC_WITH_CC
18082 
18083   default:
18084     return nullptr;
18085   }
18086 }
18087 
18088 namespace {
// Helper classes for mapping MMA builtins to the corresponding LLVM intrinsic
// variants.
18090 struct NVPTXMmaLdstInfo {
18091   unsigned NumResults;  // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if a given layout is unsupported.
18093   unsigned IID_col;
18094   unsigned IID_row;
18095 };
18096 
18097 #define MMA_INTR(geom_op_type, layout) \
18098   Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
18099 #define MMA_LDST(n, geom_op_type)                                              \
18100   { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
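// For example, MMA_LDST(8, m16n16k16_load_a_f16) expands to
//   {8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//       Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride}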
18101 
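// Maps an MMA load/store builtin to the number of elements transferred and
// the col-major/row-major intrinsic IDs for it.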
18102 static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
18103   switch (BuiltinID) {
18104   // FP MMA loads
18105   case NVPTX::BI__hmma_m16n16k16_ld_a:
18106     return MMA_LDST(8, m16n16k16_load_a_f16);
18107   case NVPTX::BI__hmma_m16n16k16_ld_b:
18108     return MMA_LDST(8, m16n16k16_load_b_f16);
18109   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
18110     return MMA_LDST(4, m16n16k16_load_c_f16);
18111   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
18112     return MMA_LDST(8, m16n16k16_load_c_f32);
18113   case NVPTX::BI__hmma_m32n8k16_ld_a:
18114     return MMA_LDST(8, m32n8k16_load_a_f16);
18115   case NVPTX::BI__hmma_m32n8k16_ld_b:
18116     return MMA_LDST(8, m32n8k16_load_b_f16);
18117   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
18118     return MMA_LDST(4, m32n8k16_load_c_f16);
18119   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
18120     return MMA_LDST(8, m32n8k16_load_c_f32);
18121   case NVPTX::BI__hmma_m8n32k16_ld_a:
18122     return MMA_LDST(8, m8n32k16_load_a_f16);
18123   case NVPTX::BI__hmma_m8n32k16_ld_b:
18124     return MMA_LDST(8, m8n32k16_load_b_f16);
18125   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
18126     return MMA_LDST(4, m8n32k16_load_c_f16);
18127   case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
18128     return MMA_LDST(8, m8n32k16_load_c_f32);
18129 
18130   // Integer MMA loads
18131   case NVPTX::BI__imma_m16n16k16_ld_a_s8:
18132     return MMA_LDST(2, m16n16k16_load_a_s8);
18133   case NVPTX::BI__imma_m16n16k16_ld_a_u8:
18134     return MMA_LDST(2, m16n16k16_load_a_u8);
18135   case NVPTX::BI__imma_m16n16k16_ld_b_s8:
18136     return MMA_LDST(2, m16n16k16_load_b_s8);
18137   case NVPTX::BI__imma_m16n16k16_ld_b_u8:
18138     return MMA_LDST(2, m16n16k16_load_b_u8);
18139   case NVPTX::BI__imma_m16n16k16_ld_c:
18140     return MMA_LDST(8, m16n16k16_load_c_s32);
18141   case NVPTX::BI__imma_m32n8k16_ld_a_s8:
18142     return MMA_LDST(4, m32n8k16_load_a_s8);
18143   case NVPTX::BI__imma_m32n8k16_ld_a_u8:
18144     return MMA_LDST(4, m32n8k16_load_a_u8);
18145   case NVPTX::BI__imma_m32n8k16_ld_b_s8:
18146     return MMA_LDST(1, m32n8k16_load_b_s8);
18147   case NVPTX::BI__imma_m32n8k16_ld_b_u8:
18148     return MMA_LDST(1, m32n8k16_load_b_u8);
18149   case NVPTX::BI__imma_m32n8k16_ld_c:
18150     return MMA_LDST(8, m32n8k16_load_c_s32);
18151   case NVPTX::BI__imma_m8n32k16_ld_a_s8:
18152     return MMA_LDST(1, m8n32k16_load_a_s8);
18153   case NVPTX::BI__imma_m8n32k16_ld_a_u8:
18154     return MMA_LDST(1, m8n32k16_load_a_u8);
18155   case NVPTX::BI__imma_m8n32k16_ld_b_s8:
18156     return MMA_LDST(4, m8n32k16_load_b_s8);
18157   case NVPTX::BI__imma_m8n32k16_ld_b_u8:
18158     return MMA_LDST(4, m8n32k16_load_b_u8);
18159   case NVPTX::BI__imma_m8n32k16_ld_c:
18160     return MMA_LDST(8, m8n32k16_load_c_s32);
18161 
18162   // Sub-integer MMA loads.
  // Only the row layout is supported for A fragments and only the col
  // layout for B fragments.
18164   case NVPTX::BI__imma_m8n8k32_ld_a_s4:
18165     return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
18166   case NVPTX::BI__imma_m8n8k32_ld_a_u4:
18167     return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
18168   case NVPTX::BI__imma_m8n8k32_ld_b_s4:
18169     return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
18170   case NVPTX::BI__imma_m8n8k32_ld_b_u4:
18171     return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
18172   case NVPTX::BI__imma_m8n8k32_ld_c:
18173     return MMA_LDST(2, m8n8k32_load_c_s32);
18174   case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
18175     return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
18176   case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
18177     return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
18178   case NVPTX::BI__bmma_m8n8k128_ld_c:
18179     return MMA_LDST(2, m8n8k128_load_c_s32);
18180 
18181   // Double MMA loads
18182   case NVPTX::BI__dmma_m8n8k4_ld_a:
18183     return MMA_LDST(1, m8n8k4_load_a_f64);
18184   case NVPTX::BI__dmma_m8n8k4_ld_b:
18185     return MMA_LDST(1, m8n8k4_load_b_f64);
18186   case NVPTX::BI__dmma_m8n8k4_ld_c:
18187     return MMA_LDST(2, m8n8k4_load_c_f64);
18188 
18189   // Alternate float MMA loads
18190   case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
18191     return MMA_LDST(4, m16n16k16_load_a_bf16);
18192   case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
18193     return MMA_LDST(4, m16n16k16_load_b_bf16);
18194   case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
18195     return MMA_LDST(2, m8n32k16_load_a_bf16);
18196   case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
18197     return MMA_LDST(8, m8n32k16_load_b_bf16);
18198   case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
18199     return MMA_LDST(8, m32n8k16_load_a_bf16);
18200   case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
18201     return MMA_LDST(2, m32n8k16_load_b_bf16);
18202   case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
18203     return MMA_LDST(4, m16n16k8_load_a_tf32);
18204   case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
18205     return MMA_LDST(4, m16n16k8_load_b_tf32);
18206   case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
18207     return MMA_LDST(8, m16n16k8_load_c_f32);
18208 
  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
18212   // FP MMA stores.
18213   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
18214     return MMA_LDST(4, m16n16k16_store_d_f16);
18215   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
18216     return MMA_LDST(8, m16n16k16_store_d_f32);
18217   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
18218     return MMA_LDST(4, m32n8k16_store_d_f16);
18219   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
18220     return MMA_LDST(8, m32n8k16_store_d_f32);
18221   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
18222     return MMA_LDST(4, m8n32k16_store_d_f16);
18223   case NVPTX::BI__hmma_m8n32k16_st_c_f32:
18224     return MMA_LDST(8, m8n32k16_store_d_f32);
18225 
18226   // Integer and sub-integer MMA stores.
18227   // Another naming quirk. Unlike other MMA builtins that use PTX types in the
18228   // name, integer loads/stores use LLVM's i32.
18229   case NVPTX::BI__imma_m16n16k16_st_c_i32:
18230     return MMA_LDST(8, m16n16k16_store_d_s32);
18231   case NVPTX::BI__imma_m32n8k16_st_c_i32:
18232     return MMA_LDST(8, m32n8k16_store_d_s32);
18233   case NVPTX::BI__imma_m8n32k16_st_c_i32:
18234     return MMA_LDST(8, m8n32k16_store_d_s32);
18235   case NVPTX::BI__imma_m8n8k32_st_c_i32:
18236     return MMA_LDST(2, m8n8k32_store_d_s32);
18237   case NVPTX::BI__bmma_m8n8k128_st_c_i32:
18238     return MMA_LDST(2, m8n8k128_store_d_s32);
18239 
18240   // Double MMA store
18241   case NVPTX::BI__dmma_m8n8k4_st_c_f64:
18242     return MMA_LDST(2, m8n8k4_store_d_f64);
18243 
18244   // Alternate float MMA store
18245   case NVPTX::BI__mma_m16n16k8_st_c_f32:
18246     return MMA_LDST(8, m16n16k8_store_d_f32);
18247 
18248   default:
18249     llvm_unreachable("Unknown MMA builtin");
18250   }
18251 }
18252 #undef MMA_LDST
18253 #undef MMA_INTR
18254 
18255 
18256 struct NVPTXMmaInfo {
18257   unsigned NumEltsA;
18258   unsigned NumEltsB;
18259   unsigned NumEltsC;
18260   unsigned NumEltsD;
18261 
18262   // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
18263   // over 'col' for layout. The index of non-satf variants is expected to match
18264   // the undocumented layout constants used by CUDA's mma.hpp.
18265   std::array<unsigned, 8> Variants;
18266 
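  // Layout encodes the A/B fragment layouts in the MMA_VARIANTS order below:
  // 0 = row_row, 1 = row_col, 2 = col_row, 3 = col_col.  Satf selects the
  // *_satfinite variant where one exists.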
18267   unsigned getMMAIntrinsic(int Layout, bool Satf) {
18268     unsigned Index = Layout + 4 * Satf;
18269     if (Index >= Variants.size())
18270       return 0;
18271     return Variants[Index];
18272   }
18273 };
18274 
// Returns the NVPTXMmaInfo for an MMA builtin; its getMMAIntrinsic() selects
// the intrinsic matching Layout and Satf for valid combinations, or 0
// otherwise.
18277 static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
18278   // clang-format off
18279 #define MMA_VARIANTS(geom, type)                                    \
18280       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
18281       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
18282       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
18283       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
18284 #define MMA_SATF_VARIANTS(geom, type)                               \
18285       MMA_VARIANTS(geom, type),                                     \
18286       Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
18287       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
18288       Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
18289       Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
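// For example, MMA_SATF_VARIANTS(m16n16k16, f16_f16) lists the eight
// nvvm_wmma_m16n16k16_mma_{row,col}_{row,col}_f16_f16[_satfinite] intrinsic
// IDs in the order consumed by getMMAIntrinsic() above.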
18290 // Sub-integer MMA only supports row.col layout.
18291 #define MMA_VARIANTS_I4(geom, type) \
18292       0, \
18293       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
18294       0, \
18295       0, \
18296       0, \
18297       Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
18298       0, \
18299       0
18300 // b1 MMA does not support .satfinite.
18301 #define MMA_VARIANTS_B1_XOR(geom, type) \
18302       0, \
18303       Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,             \
18304       0, \
18305       0, \
18306       0, \
18307       0, \
18308       0, \
18309       0
18310 #define MMA_VARIANTS_B1_AND(geom, type) \
18311       0, \
18312       Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,             \
18313       0, \
18314       0, \
18315       0, \
18316       0, \
18317       0, \
18318       0
18319   // clang-format on
18320   switch (BuiltinID) {
18321   // FP MMA
  // Note that the 'type' argument of MMA_SATF_VARIANTS uses D_C notation,
  // while the NumEltsN fields of the return value are ordered as A,B,C,D.
18324   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
18325     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
18326   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
18327     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
18328   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
18329     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
18330   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
18331     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
18332   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
18333     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
18334   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
18335     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
18336   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
18337     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
18338   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
18339     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
18340   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
18341     return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
18342   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
18343     return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
18344   case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
18345     return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
18346   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
18347     return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};
18348 
18349   // Integer MMA
18350   case NVPTX::BI__imma_m16n16k16_mma_s8:
18351     return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
18352   case NVPTX::BI__imma_m16n16k16_mma_u8:
18353     return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
18354   case NVPTX::BI__imma_m32n8k16_mma_s8:
18355     return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
18356   case NVPTX::BI__imma_m32n8k16_mma_u8:
18357     return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
18358   case NVPTX::BI__imma_m8n32k16_mma_s8:
18359     return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
18360   case NVPTX::BI__imma_m8n32k16_mma_u8:
18361     return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};
18362 
18363   // Sub-integer MMA
18364   case NVPTX::BI__imma_m8n8k32_mma_s4:
18365     return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
18366   case NVPTX::BI__imma_m8n8k32_mma_u4:
18367     return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
18368   case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
18369     return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
18370   case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
18371     return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};
18372 
18373   // Double MMA
18374   case NVPTX::BI__dmma_m8n8k4_mma_f64:
18375     return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};
18376 
18377   // Alternate FP MMA
18378   case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
18379     return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
18380   case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
18381     return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
18382   case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
18383     return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
18384   case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
18385     return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
18386   default:
18387     llvm_unreachable("Unexpected builtin ID.");
18388   }
18389 #undef MMA_VARIANTS
18390 #undef MMA_SATF_VARIANTS
18391 #undef MMA_VARIANTS_I4
18392 #undef MMA_VARIANTS_B1_AND
18393 #undef MMA_VARIANTS_B1_XOR
18394 }
18395 
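// Emit an ldg/ldu intrinsic call.  The intrinsic is mangled on the loaded
// element type and the pointer type, and the natural alignment of the pointee
// type is passed as the explicit alignment operand.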
18396 static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
18397                          const CallExpr *E) {
18398   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
18399   QualType ArgType = E->getArg(0)->getType();
18400   clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
18401   llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
18402   return CGF.Builder.CreateCall(
18403       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
18404       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
18405 }
18406 
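// Emit a scoped (cta/sys) NVVM atomic intrinsic, mangled on the pointee
// element type and the pointer type; the operands are the pointer and the
// value to apply.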
18407 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
18408                                const CallExpr *E) {
18409   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
18410   llvm::Type *ElemTy =
18411       CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
18412   return CGF.Builder.CreateCall(
18413       CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
18414       {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
18415 }
18416 
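// Emit cp.async: builtins that carry an explicit source-size argument (three
// operands) use the IntrinsicIDS variant that takes it; the two-operand forms
// use the plain IntrinsicID.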
18417 static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
18418                           CodeGenFunction &CGF, const CallExpr *E,
18419                           int SrcSize) {
18420   return E->getNumArgs() == 3
18421              ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
18422                                       {CGF.EmitScalarExpr(E->getArg(0)),
18423                                        CGF.EmitScalarExpr(E->getArg(1)),
18424                                        CGF.EmitScalarExpr(E->getArg(2))})
18425              : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
18426                                       {CGF.EmitScalarExpr(E->getArg(0)),
18427                                        CGF.EmitScalarExpr(E->getArg(1))});
18428 }
18429 
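// Emit a builtin that requires native half support: diagnose targets that
// lack it, route the f16 ldg/ldu forms through MakeLdgLdu, and otherwise
// forward the arguments (bitcast to the intrinsic parameter types) to the
// intrinsic call.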
18430 static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
18431                            const CallExpr *E, CodeGenFunction &CGF) {
18432   auto &C = CGF.CGM.getContext();
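  // Diagnose unless the language has a native half type or the target can
  // handle half without FP16 conversion intrinsics.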
18433   if (!(C.getLangOpts().NativeHalfType ||
18434         !C.getTargetInfo().useFP16ConversionIntrinsics())) {
18435     CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getName(BuiltinID).str() +
18436                                        " requires native half type support.");
18437     return nullptr;
18438   }
18439 
18440   if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
18441       IntrinsicID == Intrinsic::nvvm_ldu_global_f)
18442     return MakeLdgLdu(IntrinsicID, CGF, E);
18443 
18444   SmallVector<Value *, 16> Args;
18445   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
18446   auto *FTy = F->getFunctionType();
18447   unsigned ICEArguments = 0;
18448   ASTContext::GetBuiltinTypeError Error;
18449   C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
18450   assert(Error == ASTContext::GE_None && "Should not codegen an error");
18451   for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
18452     assert((ICEArguments & (1 << i)) == 0);
18453     auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
18454     auto *PTy = FTy->getParamType(i);
18455     if (PTy != ArgValue->getType())
18456       ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
18457     Args.push_back(ArgValue);
18458   }
18459 
18460   return CGF.Builder.CreateCall(F, Args);
18461 }
18462 } // namespace
18463 
18464 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
18465                                              const CallExpr *E) {
18466   switch (BuiltinID) {
18467   case NVPTX::BI__nvvm_atom_add_gen_i:
18468   case NVPTX::BI__nvvm_atom_add_gen_l:
18469   case NVPTX::BI__nvvm_atom_add_gen_ll:
18470     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
18471 
18472   case NVPTX::BI__nvvm_atom_sub_gen_i:
18473   case NVPTX::BI__nvvm_atom_sub_gen_l:
18474   case NVPTX::BI__nvvm_atom_sub_gen_ll:
18475     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
18476 
18477   case NVPTX::BI__nvvm_atom_and_gen_i:
18478   case NVPTX::BI__nvvm_atom_and_gen_l:
18479   case NVPTX::BI__nvvm_atom_and_gen_ll:
18480     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
18481 
18482   case NVPTX::BI__nvvm_atom_or_gen_i:
18483   case NVPTX::BI__nvvm_atom_or_gen_l:
18484   case NVPTX::BI__nvvm_atom_or_gen_ll:
18485     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
18486 
18487   case NVPTX::BI__nvvm_atom_xor_gen_i:
18488   case NVPTX::BI__nvvm_atom_xor_gen_l:
18489   case NVPTX::BI__nvvm_atom_xor_gen_ll:
18490     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
18491 
18492   case NVPTX::BI__nvvm_atom_xchg_gen_i:
18493   case NVPTX::BI__nvvm_atom_xchg_gen_l:
18494   case NVPTX::BI__nvvm_atom_xchg_gen_ll:
18495     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
18496 
18497   case NVPTX::BI__nvvm_atom_max_gen_i:
18498   case NVPTX::BI__nvvm_atom_max_gen_l:
18499   case NVPTX::BI__nvvm_atom_max_gen_ll:
18500     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
18501 
18502   case NVPTX::BI__nvvm_atom_max_gen_ui:
18503   case NVPTX::BI__nvvm_atom_max_gen_ul:
18504   case NVPTX::BI__nvvm_atom_max_gen_ull:
18505     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
18506 
18507   case NVPTX::BI__nvvm_atom_min_gen_i:
18508   case NVPTX::BI__nvvm_atom_min_gen_l:
18509   case NVPTX::BI__nvvm_atom_min_gen_ll:
18510     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
18511 
18512   case NVPTX::BI__nvvm_atom_min_gen_ui:
18513   case NVPTX::BI__nvvm_atom_min_gen_ul:
18514   case NVPTX::BI__nvvm_atom_min_gen_ull:
18515     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
18516 
18517   case NVPTX::BI__nvvm_atom_cas_gen_i:
18518   case NVPTX::BI__nvvm_atom_cas_gen_l:
18519   case NVPTX::BI__nvvm_atom_cas_gen_ll:
18520     // __nvvm_atom_cas_gen_* should return the old value rather than the
18521     // success flag.
18522     return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
18523 
18524   case NVPTX::BI__nvvm_atom_add_gen_f:
18525   case NVPTX::BI__nvvm_atom_add_gen_d: {
18526     Value *Ptr = EmitScalarExpr(E->getArg(0));
18527     Value *Val = EmitScalarExpr(E->getArg(1));
18528     return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, Ptr, Val,
18529                                    AtomicOrdering::SequentiallyConsistent);
18530   }
18531 
18532   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
18533     Value *Ptr = EmitScalarExpr(E->getArg(0));
18534     Value *Val = EmitScalarExpr(E->getArg(1));
18535     Function *FnALI32 =
18536         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
18537     return Builder.CreateCall(FnALI32, {Ptr, Val});
18538   }
18539 
18540   case NVPTX::BI__nvvm_atom_dec_gen_ui: {
18541     Value *Ptr = EmitScalarExpr(E->getArg(0));
18542     Value *Val = EmitScalarExpr(E->getArg(1));
18543     Function *FnALD32 =
18544         CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
18545     return Builder.CreateCall(FnALD32, {Ptr, Val});
18546   }
18547 
18548   case NVPTX::BI__nvvm_ldg_c:
18549   case NVPTX::BI__nvvm_ldg_sc:
18550   case NVPTX::BI__nvvm_ldg_c2:
18551   case NVPTX::BI__nvvm_ldg_sc2:
18552   case NVPTX::BI__nvvm_ldg_c4:
18553   case NVPTX::BI__nvvm_ldg_sc4:
18554   case NVPTX::BI__nvvm_ldg_s:
18555   case NVPTX::BI__nvvm_ldg_s2:
18556   case NVPTX::BI__nvvm_ldg_s4:
18557   case NVPTX::BI__nvvm_ldg_i:
18558   case NVPTX::BI__nvvm_ldg_i2:
18559   case NVPTX::BI__nvvm_ldg_i4:
18560   case NVPTX::BI__nvvm_ldg_l:
18561   case NVPTX::BI__nvvm_ldg_l2:
18562   case NVPTX::BI__nvvm_ldg_ll:
18563   case NVPTX::BI__nvvm_ldg_ll2:
18564   case NVPTX::BI__nvvm_ldg_uc:
18565   case NVPTX::BI__nvvm_ldg_uc2:
18566   case NVPTX::BI__nvvm_ldg_uc4:
18567   case NVPTX::BI__nvvm_ldg_us:
18568   case NVPTX::BI__nvvm_ldg_us2:
18569   case NVPTX::BI__nvvm_ldg_us4:
18570   case NVPTX::BI__nvvm_ldg_ui:
18571   case NVPTX::BI__nvvm_ldg_ui2:
18572   case NVPTX::BI__nvvm_ldg_ui4:
18573   case NVPTX::BI__nvvm_ldg_ul:
18574   case NVPTX::BI__nvvm_ldg_ul2:
18575   case NVPTX::BI__nvvm_ldg_ull:
18576   case NVPTX::BI__nvvm_ldg_ull2:
18577     // PTX Interoperability section 2.2: "For a vector with an even number of
18578     // elements, its alignment is set to number of elements times the alignment
18579     // of its member: n*alignof(t)."
18580     return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
18581   case NVPTX::BI__nvvm_ldg_f:
18582   case NVPTX::BI__nvvm_ldg_f2:
18583   case NVPTX::BI__nvvm_ldg_f4:
18584   case NVPTX::BI__nvvm_ldg_d:
18585   case NVPTX::BI__nvvm_ldg_d2:
18586     return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
18587 
18588   case NVPTX::BI__nvvm_ldu_c:
18589   case NVPTX::BI__nvvm_ldu_sc:
18590   case NVPTX::BI__nvvm_ldu_c2:
18591   case NVPTX::BI__nvvm_ldu_sc2:
18592   case NVPTX::BI__nvvm_ldu_c4:
18593   case NVPTX::BI__nvvm_ldu_sc4:
18594   case NVPTX::BI__nvvm_ldu_s:
18595   case NVPTX::BI__nvvm_ldu_s2:
18596   case NVPTX::BI__nvvm_ldu_s4:
18597   case NVPTX::BI__nvvm_ldu_i:
18598   case NVPTX::BI__nvvm_ldu_i2:
18599   case NVPTX::BI__nvvm_ldu_i4:
18600   case NVPTX::BI__nvvm_ldu_l:
18601   case NVPTX::BI__nvvm_ldu_l2:
18602   case NVPTX::BI__nvvm_ldu_ll:
18603   case NVPTX::BI__nvvm_ldu_ll2:
18604   case NVPTX::BI__nvvm_ldu_uc:
18605   case NVPTX::BI__nvvm_ldu_uc2:
18606   case NVPTX::BI__nvvm_ldu_uc4:
18607   case NVPTX::BI__nvvm_ldu_us:
18608   case NVPTX::BI__nvvm_ldu_us2:
18609   case NVPTX::BI__nvvm_ldu_us4:
18610   case NVPTX::BI__nvvm_ldu_ui:
18611   case NVPTX::BI__nvvm_ldu_ui2:
18612   case NVPTX::BI__nvvm_ldu_ui4:
18613   case NVPTX::BI__nvvm_ldu_ul:
18614   case NVPTX::BI__nvvm_ldu_ul2:
18615   case NVPTX::BI__nvvm_ldu_ull:
18616   case NVPTX::BI__nvvm_ldu_ull2:
18617     return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
18618   case NVPTX::BI__nvvm_ldu_f:
18619   case NVPTX::BI__nvvm_ldu_f2:
18620   case NVPTX::BI__nvvm_ldu_f4:
18621   case NVPTX::BI__nvvm_ldu_d:
18622   case NVPTX::BI__nvvm_ldu_d2:
18623     return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
18624 
18625   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
18626   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
18627   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
18628     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
18629   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
18630   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
18631   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
18632     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
18633   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
18634   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
18635     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
18636   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
18637   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
18638     return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
18639   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
18640   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
18641   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
18642     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
18643   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
18644   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
18645   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
18646     return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
18647   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
18648   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
18649   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
18650   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
18651   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
18652   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
18653     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
18654   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
18655   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
18656   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
18657   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
18658   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
18659   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
18660     return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
18661   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
18662   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
18663   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
18664   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
18665   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
18666   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
18667     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
18668   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
18669   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
18670   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
18671   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
18672   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
18673   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
18674     return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
18675   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
18676     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
18677   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
18678     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
18679   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
18680     return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
18681   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
18682     return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
18683   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
18684   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
18685   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
18686     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
18687   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
18688   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
18689   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
18690     return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
18691   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
18692   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
18693   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
18694     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
18695   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
18696   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
18697   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
18698     return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
18699   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
18700   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
18701   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
18702     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
18703   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
18704   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
18705   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
18706     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
18707   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
18708   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
18709   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
18710     Value *Ptr = EmitScalarExpr(E->getArg(0));
18711     llvm::Type *ElemTy =
18712         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
18713     return Builder.CreateCall(
18714         CGM.getIntrinsic(
18715             Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
18716         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
18717   }
18718   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
18719   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
18720   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
18721     Value *Ptr = EmitScalarExpr(E->getArg(0));
18722     llvm::Type *ElemTy =
18723         ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
18724     return Builder.CreateCall(
18725         CGM.getIntrinsic(
18726             Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
18727         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
18728   }
18729   case NVPTX::BI__nvvm_match_all_sync_i32p:
18730   case NVPTX::BI__nvvm_match_all_sync_i64p: {
18731     Value *Mask = EmitScalarExpr(E->getArg(0));
18732     Value *Val = EmitScalarExpr(E->getArg(1));
18733     Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
18734     Value *ResultPair = Builder.CreateCall(
18735         CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
18736                              ? Intrinsic::nvvm_match_all_sync_i32p
18737                              : Intrinsic::nvvm_match_all_sync_i64p),
18738         {Mask, Val});
18739     Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
18740                                      PredOutPtr.getElementType());
18741     Builder.CreateStore(Pred, PredOutPtr);
18742     return Builder.CreateExtractValue(ResultPair, 0);
18743   }
18744 
18745   // FP MMA loads
18746   case NVPTX::BI__hmma_m16n16k16_ld_a:
18747   case NVPTX::BI__hmma_m16n16k16_ld_b:
18748   case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
18749   case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
18750   case NVPTX::BI__hmma_m32n8k16_ld_a:
18751   case NVPTX::BI__hmma_m32n8k16_ld_b:
18752   case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
18753   case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
18754   case NVPTX::BI__hmma_m8n32k16_ld_a:
18755   case NVPTX::BI__hmma_m8n32k16_ld_b:
18756   case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
18757   case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
18758   // Integer MMA loads.
18759   case NVPTX::BI__imma_m16n16k16_ld_a_s8:
18760   case NVPTX::BI__imma_m16n16k16_ld_a_u8:
18761   case NVPTX::BI__imma_m16n16k16_ld_b_s8:
18762   case NVPTX::BI__imma_m16n16k16_ld_b_u8:
18763   case NVPTX::BI__imma_m16n16k16_ld_c:
18764   case NVPTX::BI__imma_m32n8k16_ld_a_s8:
18765   case NVPTX::BI__imma_m32n8k16_ld_a_u8:
18766   case NVPTX::BI__imma_m32n8k16_ld_b_s8:
18767   case NVPTX::BI__imma_m32n8k16_ld_b_u8:
18768   case NVPTX::BI__imma_m32n8k16_ld_c:
18769   case NVPTX::BI__imma_m8n32k16_ld_a_s8:
18770   case NVPTX::BI__imma_m8n32k16_ld_a_u8:
18771   case NVPTX::BI__imma_m8n32k16_ld_b_s8:
18772   case NVPTX::BI__imma_m8n32k16_ld_b_u8:
18773   case NVPTX::BI__imma_m8n32k16_ld_c:
18774   // Sub-integer MMA loads.
18775   case NVPTX::BI__imma_m8n8k32_ld_a_s4:
18776   case NVPTX::BI__imma_m8n8k32_ld_a_u4:
18777   case NVPTX::BI__imma_m8n8k32_ld_b_s4:
18778   case NVPTX::BI__imma_m8n8k32_ld_b_u4:
18779   case NVPTX::BI__imma_m8n8k32_ld_c:
18780   case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
18781   case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
18782   case NVPTX::BI__bmma_m8n8k128_ld_c:
18783   // Double MMA loads.
18784   case NVPTX::BI__dmma_m8n8k4_ld_a:
18785   case NVPTX::BI__dmma_m8n8k4_ld_b:
18786   case NVPTX::BI__dmma_m8n8k4_ld_c:
18787   // Alternate float MMA loads.
18788   case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
18789   case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
18790   case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
18791   case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
18792   case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
18793   case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
18794   case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
18795   case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
18796   case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
18797     Address Dst = EmitPointerWithAlignment(E->getArg(0));
18798     Value *Src = EmitScalarExpr(E->getArg(1));
18799     Value *Ldm = EmitScalarExpr(E->getArg(2));
18800     std::optional<llvm::APSInt> isColMajorArg =
18801         E->getArg(3)->getIntegerConstantExpr(getContext());
18802     if (!isColMajorArg)
18803       return nullptr;
18804     bool isColMajor = isColMajorArg->getSExtValue();
18805     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
18806     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
18807     if (IID == 0)
18808       return nullptr;
18809 
18810     Value *Result =
18811         Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
18812 
18813     // Save returned values.
18814     assert(II.NumResults);
18815     if (II.NumResults == 1) {
18816       Builder.CreateAlignedStore(Result, Dst.getPointer(),
18817                                  CharUnits::fromQuantity(4));
18818     } else {
18819       for (unsigned i = 0; i < II.NumResults; ++i) {
18820         Builder.CreateAlignedStore(
18821             Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
18822                                   Dst.getElementType()),
18823             Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
18824                               llvm::ConstantInt::get(IntTy, i)),
18825             CharUnits::fromQuantity(4));
18826       }
18827     }
18828     return Result;
18829   }
18830 
18831   case NVPTX::BI__hmma_m16n16k16_st_c_f16:
18832   case NVPTX::BI__hmma_m16n16k16_st_c_f32:
18833   case NVPTX::BI__hmma_m32n8k16_st_c_f16:
18834   case NVPTX::BI__hmma_m32n8k16_st_c_f32:
18835   case NVPTX::BI__hmma_m8n32k16_st_c_f16:
18836   case NVPTX::BI__hmma_m8n32k16_st_c_f32:
18837   case NVPTX::BI__imma_m16n16k16_st_c_i32:
18838   case NVPTX::BI__imma_m32n8k16_st_c_i32:
18839   case NVPTX::BI__imma_m8n32k16_st_c_i32:
18840   case NVPTX::BI__imma_m8n8k32_st_c_i32:
18841   case NVPTX::BI__bmma_m8n8k128_st_c_i32:
18842   case NVPTX::BI__dmma_m8n8k4_st_c_f64:
18843   case NVPTX::BI__mma_m16n16k8_st_c_f32: {
18844     Value *Dst = EmitScalarExpr(E->getArg(0));
18845     Address Src = EmitPointerWithAlignment(E->getArg(1));
18846     Value *Ldm = EmitScalarExpr(E->getArg(2));
18847     std::optional<llvm::APSInt> isColMajorArg =
18848         E->getArg(3)->getIntegerConstantExpr(getContext());
18849     if (!isColMajorArg)
18850       return nullptr;
18851     bool isColMajor = isColMajorArg->getSExtValue();
18852     NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
18853     unsigned IID = isColMajor ? II.IID_col : II.IID_row;
18854     if (IID == 0)
18855       return nullptr;
18856     Function *Intrinsic =
18857         CGM.getIntrinsic(IID, Dst->getType());
18858     llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
18859     SmallVector<Value *, 10> Values = {Dst};
18860     for (unsigned i = 0; i < II.NumResults; ++i) {
18861       Value *V = Builder.CreateAlignedLoad(
18862           Src.getElementType(),
18863           Builder.CreateGEP(Src.getElementType(), Src.getPointer(),
18864                             llvm::ConstantInt::get(IntTy, i)),
18865           CharUnits::fromQuantity(4));
18866       Values.push_back(Builder.CreateBitCast(V, ParamType));
18867     }
18868     Values.push_back(Ldm);
18869     Value *Result = Builder.CreateCall(Intrinsic, Values);
18870     return Result;
18871   }
18872 
18873   // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
18874   // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
18875   case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
18876   case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
18877   case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
18878   case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
18879   case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
18880   case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
18881   case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
18882   case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
18883   case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
18884   case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
18885   case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
18886   case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
18887   case NVPTX::BI__imma_m16n16k16_mma_s8:
18888   case NVPTX::BI__imma_m16n16k16_mma_u8:
18889   case NVPTX::BI__imma_m32n8k16_mma_s8:
18890   case NVPTX::BI__imma_m32n8k16_mma_u8:
18891   case NVPTX::BI__imma_m8n32k16_mma_s8:
18892   case NVPTX::BI__imma_m8n32k16_mma_u8:
18893   case NVPTX::BI__imma_m8n8k32_mma_s4:
18894   case NVPTX::BI__imma_m8n8k32_mma_u4:
18895   case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
18896   case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
18897   case NVPTX::BI__dmma_m8n8k4_mma_f64:
18898   case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
18899   case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
18900   case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
18901   case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
18902     Address Dst = EmitPointerWithAlignment(E->getArg(0));
18903     Address SrcA = EmitPointerWithAlignment(E->getArg(1));
18904     Address SrcB = EmitPointerWithAlignment(E->getArg(2));
18905     Address SrcC = EmitPointerWithAlignment(E->getArg(3));
18906     std::optional<llvm::APSInt> LayoutArg =
18907         E->getArg(4)->getIntegerConstantExpr(getContext());
18908     if (!LayoutArg)
18909       return nullptr;
18910     int Layout = LayoutArg->getSExtValue();
18911     if (Layout < 0 || Layout > 3)
18912       return nullptr;
18913     llvm::APSInt SatfArg;
18914     if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
18915         BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0;  // .b1 does not have a satf argument.
18917     else if (std::optional<llvm::APSInt> OptSatfArg =
18918                  E->getArg(5)->getIntegerConstantExpr(getContext()))
18919       SatfArg = *OptSatfArg;
18920     else
18921       return nullptr;
18922     bool Satf = SatfArg.getSExtValue();
18923     NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
18924     unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
18925     if (IID == 0)  // Unsupported combination of Layout/Satf.
18926       return nullptr;
18927 
18928     SmallVector<Value *, 24> Values;
18929     Function *Intrinsic = CGM.getIntrinsic(IID);
18930     llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
18931     // Load A
18932     for (unsigned i = 0; i < MI.NumEltsA; ++i) {
18933       Value *V = Builder.CreateAlignedLoad(
18934           SrcA.getElementType(),
18935           Builder.CreateGEP(SrcA.getElementType(), SrcA.getPointer(),
18936                             llvm::ConstantInt::get(IntTy, i)),
18937           CharUnits::fromQuantity(4));
18938       Values.push_back(Builder.CreateBitCast(V, AType));
18939     }
18940     // Load B
18941     llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
18942     for (unsigned i = 0; i < MI.NumEltsB; ++i) {
18943       Value *V = Builder.CreateAlignedLoad(
18944           SrcB.getElementType(),
18945           Builder.CreateGEP(SrcB.getElementType(), SrcB.getPointer(),
18946                             llvm::ConstantInt::get(IntTy, i)),
18947           CharUnits::fromQuantity(4));
18948       Values.push_back(Builder.CreateBitCast(V, BType));
18949     }
18950     // Load C
18951     llvm::Type *CType =
18952         Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
18953     for (unsigned i = 0; i < MI.NumEltsC; ++i) {
18954       Value *V = Builder.CreateAlignedLoad(
18955           SrcC.getElementType(),
18956           Builder.CreateGEP(SrcC.getElementType(), SrcC.getPointer(),
18957                             llvm::ConstantInt::get(IntTy, i)),
18958           CharUnits::fromQuantity(4));
18959       Values.push_back(Builder.CreateBitCast(V, CType));
18960     }
18961     Value *Result = Builder.CreateCall(Intrinsic, Values);
18962     llvm::Type *DType = Dst.getElementType();
18963     for (unsigned i = 0; i < MI.NumEltsD; ++i)
18964       Builder.CreateAlignedStore(
18965           Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
18966           Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
18967                             llvm::ConstantInt::get(IntTy, i)),
18968           CharUnits::fromQuantity(4));
18969     return Result;
18970   }
18971   // The following builtins require half type support
18972   case NVPTX::BI__nvvm_ex2_approx_f16:
18973     return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
18974   case NVPTX::BI__nvvm_ex2_approx_f16x2:
18975     return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
18976   case NVPTX::BI__nvvm_ff2f16x2_rn:
18977     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
18978   case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
18979     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
18980   case NVPTX::BI__nvvm_ff2f16x2_rz:
18981     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
18982   case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
18983     return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
18984   case NVPTX::BI__nvvm_fma_rn_f16:
18985     return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
18986   case NVPTX::BI__nvvm_fma_rn_f16x2:
18987     return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
18988   case NVPTX::BI__nvvm_fma_rn_ftz_f16:
18989     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
18990   case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
18991     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
18992   case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
18993     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
18994                         *this);
18995   case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
18996     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
18997                         *this);
18998   case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
18999     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
19000                         *this);
19001   case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
19002     return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
19003                         *this);
19004   case NVPTX::BI__nvvm_fma_rn_relu_f16:
19005     return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
19006   case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
19007     return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
19008   case NVPTX::BI__nvvm_fma_rn_sat_f16:
19009     return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
19010   case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
19011     return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
19012   case NVPTX::BI__nvvm_fmax_f16:
19013     return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
19014   case NVPTX::BI__nvvm_fmax_f16x2:
19015     return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
19016   case NVPTX::BI__nvvm_fmax_ftz_f16:
19017     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
19018   case NVPTX::BI__nvvm_fmax_ftz_f16x2:
19019     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
19020   case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
19021     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
19022   case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
19023     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
19024                         *this);
19025   case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
19026     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
19027                         E, *this);
19028   case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
19029     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
19030                         BuiltinID, E, *this);
19031   case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
19032     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
19033                         *this);
19034   case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
19035     return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
19036                         E, *this);
19037   case NVPTX::BI__nvvm_fmax_nan_f16:
19038     return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
19039   case NVPTX::BI__nvvm_fmax_nan_f16x2:
19040     return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
19041   case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
19042     return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
19043                         *this);
19044   case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
19045     return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
19046                         E, *this);
19047   case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
19048     return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
19049                         *this);
19050   case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
19051     return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
19052                         *this);
19053   case NVPTX::BI__nvvm_fmin_f16:
19054     return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
19055   case NVPTX::BI__nvvm_fmin_f16x2:
19056     return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
19057   case NVPTX::BI__nvvm_fmin_ftz_f16:
19058     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
19059   case NVPTX::BI__nvvm_fmin_ftz_f16x2:
19060     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
19061   case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
19062     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
19063   case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
19064     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
19065                         *this);
19066   case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
19067     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
19068                         E, *this);
19069   case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
19070     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
19071                         BuiltinID, E, *this);
19072   case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
19073     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
19074                         *this);
19075   case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
19076     return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
19077                         E, *this);
19078   case NVPTX::BI__nvvm_fmin_nan_f16:
19079     return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
19080   case NVPTX::BI__nvvm_fmin_nan_f16x2:
19081     return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
19082   case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
19083     return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
19084                         *this);
19085   case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
19086     return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
19087                         E, *this);
19088   case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
19089     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
19090                         *this);
19091   case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
19092     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
19093                         *this);
19094   case NVPTX::BI__nvvm_ldg_h:
19095     return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
19096   case NVPTX::BI__nvvm_ldg_h2:
19097     return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
19098   case NVPTX::BI__nvvm_ldu_h:
19099     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
19100   case NVPTX::BI__nvvm_ldu_h2:
19101     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
19103   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
19104     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
19105                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
19106                        4);
19107   case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
19108     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
19109                        Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
19110                        8);
19111   case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
19112     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
19113                        Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
19114                        16);
19115   case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
19116     return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
19117                        Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
19118                        16);
19119   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
19120     return Builder.CreateCall(
19121         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
19122   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
19123     return Builder.CreateCall(
19124         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
19125   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
19126     return Builder.CreateCall(
19127         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
19128   case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
19129     return Builder.CreateCall(
19130         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
19131   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
19132     return Builder.CreateCall(
19133         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
19134   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
19135     return Builder.CreateCall(
19136         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
19137   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
19138     return Builder.CreateCall(
19139         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
19140   case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
19141     return Builder.CreateCall(
19142         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
19143   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
19144     return Builder.CreateCall(
19145         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
19146   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
19147     return Builder.CreateCall(
19148         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
19149   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
19150     return Builder.CreateCall(
19151         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
19152   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
19153     return Builder.CreateCall(
19154         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
19155   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
19156     return Builder.CreateCall(
19157         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
19158   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
19159     return Builder.CreateCall(
19160         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
19161   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
19162     return Builder.CreateCall(
19163         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
19164   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
19165     return Builder.CreateCall(
19166         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
19167   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
19168     return Builder.CreateCall(
19169         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
19170   case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
19171     return Builder.CreateCall(
19172         CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
19173   case NVPTX::BI__nvvm_is_explicit_cluster:
19174     return Builder.CreateCall(
19175         CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
19176   case NVPTX::BI__nvvm_isspacep_shared_cluster:
19177     return Builder.CreateCall(
19178         CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
19179         EmitScalarExpr(E->getArg(0)));
19180   case NVPTX::BI__nvvm_mapa:
19181     return Builder.CreateCall(
19182         CGM.getIntrinsic(Intrinsic::nvvm_mapa),
19183         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
19184   case NVPTX::BI__nvvm_mapa_shared_cluster:
19185     return Builder.CreateCall(
19186         CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
19187         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
19188   case NVPTX::BI__nvvm_getctarank:
19189     return Builder.CreateCall(
19190         CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
19191         EmitScalarExpr(E->getArg(0)));
19192   case NVPTX::BI__nvvm_getctarank_shared_cluster:
19193     return Builder.CreateCall(
19194         CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
19195         EmitScalarExpr(E->getArg(0)));
19196   case NVPTX::BI__nvvm_barrier_cluster_arrive:
19197     return Builder.CreateCall(
19198         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
19199   case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
19200     return Builder.CreateCall(
19201         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
19202   case NVPTX::BI__nvvm_barrier_cluster_wait:
19203     return Builder.CreateCall(
19204         CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
19205   case NVPTX::BI__nvvm_fence_sc_cluster:
19206     return Builder.CreateCall(
19207         CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
19208   default:
19209     return nullptr;
19210   }
19211 }
19212 
19213 namespace {
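/// Gathers the operands common to the alignment builtins: the source value
/// (with arrays decayed to pointers), the integer type used for the arithmetic
/// (index-sized for pointer arguments), the requested alignment, and
/// Mask = Alignment - 1.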
19214 struct BuiltinAlignArgs {
19215   llvm::Value *Src = nullptr;
19216   llvm::Type *SrcType = nullptr;
19217   llvm::Value *Alignment = nullptr;
19218   llvm::Value *Mask = nullptr;
19219   llvm::IntegerType *IntType = nullptr;
19220 
19221   BuiltinAlignArgs(const CallExpr *E, CodeGenFunction &CGF) {
19222     QualType AstType = E->getArg(0)->getType();
19223     if (AstType->isArrayType())
19224       Src = CGF.EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19225     else
19226       Src = CGF.EmitScalarExpr(E->getArg(0));
19227     SrcType = Src->getType();
19228     if (SrcType->isPointerTy()) {
19229       IntType = IntegerType::get(
19230           CGF.getLLVMContext(),
19231           CGF.CGM.getDataLayout().getIndexTypeSizeInBits(SrcType));
19232     } else {
19233       assert(SrcType->isIntegerTy());
19234       IntType = cast<llvm::IntegerType>(SrcType);
19235     }
19236     Alignment = CGF.EmitScalarExpr(E->getArg(1));
19237     Alignment = CGF.Builder.CreateZExtOrTrunc(Alignment, IntType, "alignment");
19238     auto *One = llvm::ConstantInt::get(IntType, 1);
19239     Mask = CGF.Builder.CreateSub(Alignment, One, "mask");
19240   }
19241 };
19242 } // namespace
19243 
19244 /// Generate (x & (y-1)) == 0.
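/// For example, __builtin_is_aligned(p, 16) is emitted roughly as
///   (ptrtoint(p) & 15) == 0
/// once the pointer has been converted to the index-sized integer type.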
19245 RValue CodeGenFunction::EmitBuiltinIsAligned(const CallExpr *E) {
19246   BuiltinAlignArgs Args(E, *this);
19247   llvm::Value *SrcAddress = Args.Src;
19248   if (Args.SrcType->isPointerTy())
19249     SrcAddress =
19250         Builder.CreateBitOrPointerCast(Args.Src, Args.IntType, "src_addr");
19251   return RValue::get(Builder.CreateICmpEQ(
19252       Builder.CreateAnd(SrcAddress, Args.Mask, "set_bits"),
19253       llvm::Constant::getNullValue(Args.IntType), "is_aligned"));
19254 }
19255 
19256 /// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
19257 /// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
19258 /// llvm.ptrmask intrinsic (with a GEP beforehand in the align_up case).
19259 /// TODO: actually use ptrmask once most optimization passes know about it.
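/// For example, __builtin_align_up(p, 32) is emitted roughly as
///   aligned = (ptrtoint(p) + 31) & ~31
/// followed by a GEP of (aligned - ptrtoint(p)) bytes from p, so the result
/// keeps the provenance of the original pointer.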
19260 RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
19261   BuiltinAlignArgs Args(E, *this);
19262   llvm::Value *SrcAddr = Args.Src;
19263   if (Args.Src->getType()->isPointerTy())
19264     SrcAddr = Builder.CreatePtrToInt(Args.Src, Args.IntType, "intptr");
19265   llvm::Value *SrcForMask = SrcAddr;
19266   if (AlignUp) {
19267     // When aligning up, we first add the mask so that any value that is not
19268     // already aligned crosses the next alignment boundary, and then mask down
19269     // to that boundary. Adding the mask rather than the full alignment value
19270     // ensures that align_up on an already aligned value does not change it.
19271     SrcForMask = Builder.CreateAdd(SrcForMask, Args.Mask, "over_boundary");
19272   }
19273   // Invert the mask to only clear the lower bits.
19274   llvm::Value *InvertedMask = Builder.CreateNot(Args.Mask, "inverted_mask");
19275   llvm::Value *Result =
19276       Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result");
19277   if (Args.Src->getType()->isPointerTy()) {
19278     /// TODO: Use ptrmask instead of ptrtoint+gep once it is optimized well.
19279     // Result = Builder.CreateIntrinsic(
19280     //  Intrinsic::ptrmask, {Args.SrcType, SrcForMask->getType(), Args.IntType},
19281     //  {SrcForMask, InvertedMask}, nullptr, "aligned_result");
19282     Result->setName("aligned_intptr");
19283     llvm::Value *Difference = Builder.CreateSub(Result, SrcAddr, "diff");
19284     // The result must point to the same underlying allocation. This means we
19285     // can use an inbounds GEP to enable better optimization.
19286     if (getLangOpts().isSignedOverflowDefined())
19287       Result =
19288           Builder.CreateGEP(Int8Ty, Args.Src, Difference, "aligned_result");
19289     else
19290       Result = EmitCheckedInBoundsGEP(Int8Ty, Args.Src, Difference,
19291                                       /*SignedIndices=*/true,
19292                                       /*isSubtraction=*/!AlignUp,
19293                                       E->getExprLoc(), "aligned_result");
19294     // Emit an alignment assumption to ensure that the new alignment is
19295     // propagated to loads/stores, etc.
19296     emitAlignmentAssumption(Result, E, E->getExprLoc(), Args.Alignment);
19297   }
19298   assert(Result->getType() == Args.SrcType);
19299   return RValue::get(Result);
19300 }
19301 
19302 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
19303                                                    const CallExpr *E) {
19304   switch (BuiltinID) {
19305   case WebAssembly::BI__builtin_wasm_memory_size: {
19306     llvm::Type *ResultType = ConvertType(E->getType());
19307     Value *I = EmitScalarExpr(E->getArg(0));
19308     Function *Callee =
19309         CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
19310     return Builder.CreateCall(Callee, I);
19311   }
19312   case WebAssembly::BI__builtin_wasm_memory_grow: {
19313     llvm::Type *ResultType = ConvertType(E->getType());
19314     Value *Args[] = {EmitScalarExpr(E->getArg(0)),
19315                      EmitScalarExpr(E->getArg(1))};
19316     Function *Callee =
19317         CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
19318     return Builder.CreateCall(Callee, Args);
19319   }
19320   case WebAssembly::BI__builtin_wasm_tls_size: {
19321     llvm::Type *ResultType = ConvertType(E->getType());
19322     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_size, ResultType);
19323     return Builder.CreateCall(Callee);
19324   }
19325   case WebAssembly::BI__builtin_wasm_tls_align: {
19326     llvm::Type *ResultType = ConvertType(E->getType());
19327     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_align, ResultType);
19328     return Builder.CreateCall(Callee);
19329   }
19330   case WebAssembly::BI__builtin_wasm_tls_base: {
19331     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_base);
19332     return Builder.CreateCall(Callee);
19333   }
19334   case WebAssembly::BI__builtin_wasm_throw: {
19335     Value *Tag = EmitScalarExpr(E->getArg(0));
19336     Value *Obj = EmitScalarExpr(E->getArg(1));
19337     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
19338     return Builder.CreateCall(Callee, {Tag, Obj});
19339   }
19340   case WebAssembly::BI__builtin_wasm_rethrow: {
19341     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
19342     return Builder.CreateCall(Callee);
19343   }
19344   case WebAssembly::BI__builtin_wasm_memory_atomic_wait32: {
19345     Value *Addr = EmitScalarExpr(E->getArg(0));
19346     Value *Expected = EmitScalarExpr(E->getArg(1));
19347     Value *Timeout = EmitScalarExpr(E->getArg(2));
19348     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait32);
19349     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
19350   }
19351   case WebAssembly::BI__builtin_wasm_memory_atomic_wait64: {
19352     Value *Addr = EmitScalarExpr(E->getArg(0));
19353     Value *Expected = EmitScalarExpr(E->getArg(1));
19354     Value *Timeout = EmitScalarExpr(E->getArg(2));
19355     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait64);
19356     return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
19357   }
19358   case WebAssembly::BI__builtin_wasm_memory_atomic_notify: {
19359     Value *Addr = EmitScalarExpr(E->getArg(0));
19360     Value *Count = EmitScalarExpr(E->getArg(1));
19361     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_notify);
19362     return Builder.CreateCall(Callee, {Addr, Count});
19363   }
19364   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f32:
19365   case WebAssembly::BI__builtin_wasm_trunc_s_i32_f64:
19366   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f32:
19367   case WebAssembly::BI__builtin_wasm_trunc_s_i64_f64: {
19368     Value *Src = EmitScalarExpr(E->getArg(0));
19369     llvm::Type *ResT = ConvertType(E->getType());
19370     Function *Callee =
19371         CGM.getIntrinsic(Intrinsic::wasm_trunc_signed, {ResT, Src->getType()});
19372     return Builder.CreateCall(Callee, {Src});
19373   }
19374   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f32:
19375   case WebAssembly::BI__builtin_wasm_trunc_u_i32_f64:
19376   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f32:
19377   case WebAssembly::BI__builtin_wasm_trunc_u_i64_f64: {
19378     Value *Src = EmitScalarExpr(E->getArg(0));
19379     llvm::Type *ResT = ConvertType(E->getType());
19380     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_unsigned,
19381                                         {ResT, Src->getType()});
19382     return Builder.CreateCall(Callee, {Src});
19383   }
19384   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f32:
19385   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
19386   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
19387   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
19388   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
19389     Value *Src = EmitScalarExpr(E->getArg(0));
19390     llvm::Type *ResT = ConvertType(E->getType());
19391     Function *Callee =
19392         CGM.getIntrinsic(Intrinsic::fptosi_sat, {ResT, Src->getType()});
19393     return Builder.CreateCall(Callee, {Src});
19394   }
19395   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32:
19396   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
19397   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
19398   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
19399   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
19400     Value *Src = EmitScalarExpr(E->getArg(0));
19401     llvm::Type *ResT = ConvertType(E->getType());
19402     Function *Callee =
19403         CGM.getIntrinsic(Intrinsic::fptoui_sat, {ResT, Src->getType()});
19404     return Builder.CreateCall(Callee, {Src});
19405   }
19406   case WebAssembly::BI__builtin_wasm_min_f32:
19407   case WebAssembly::BI__builtin_wasm_min_f64:
19408   case WebAssembly::BI__builtin_wasm_min_f32x4:
19409   case WebAssembly::BI__builtin_wasm_min_f64x2: {
19410     Value *LHS = EmitScalarExpr(E->getArg(0));
19411     Value *RHS = EmitScalarExpr(E->getArg(1));
19412     Function *Callee =
19413         CGM.getIntrinsic(Intrinsic::minimum, ConvertType(E->getType()));
19414     return Builder.CreateCall(Callee, {LHS, RHS});
19415   }
19416   case WebAssembly::BI__builtin_wasm_max_f32:
19417   case WebAssembly::BI__builtin_wasm_max_f64:
19418   case WebAssembly::BI__builtin_wasm_max_f32x4:
19419   case WebAssembly::BI__builtin_wasm_max_f64x2: {
19420     Value *LHS = EmitScalarExpr(E->getArg(0));
19421     Value *RHS = EmitScalarExpr(E->getArg(1));
19422     Function *Callee =
19423         CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
19424     return Builder.CreateCall(Callee, {LHS, RHS});
19425   }
19426   case WebAssembly::BI__builtin_wasm_pmin_f32x4:
19427   case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
19428     Value *LHS = EmitScalarExpr(E->getArg(0));
19429     Value *RHS = EmitScalarExpr(E->getArg(1));
19430     Function *Callee =
19431         CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
19432     return Builder.CreateCall(Callee, {LHS, RHS});
19433   }
19434   case WebAssembly::BI__builtin_wasm_pmax_f32x4:
19435   case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
19436     Value *LHS = EmitScalarExpr(E->getArg(0));
19437     Value *RHS = EmitScalarExpr(E->getArg(1));
19438     Function *Callee =
19439         CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
19440     return Builder.CreateCall(Callee, {LHS, RHS});
19441   }
19442   case WebAssembly::BI__builtin_wasm_ceil_f32x4:
19443   case WebAssembly::BI__builtin_wasm_floor_f32x4:
19444   case WebAssembly::BI__builtin_wasm_trunc_f32x4:
19445   case WebAssembly::BI__builtin_wasm_nearest_f32x4:
19446   case WebAssembly::BI__builtin_wasm_ceil_f64x2:
19447   case WebAssembly::BI__builtin_wasm_floor_f64x2:
19448   case WebAssembly::BI__builtin_wasm_trunc_f64x2:
19449   case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
19450     unsigned IntNo;
19451     switch (BuiltinID) {
19452     case WebAssembly::BI__builtin_wasm_ceil_f32x4:
19453     case WebAssembly::BI__builtin_wasm_ceil_f64x2:
19454       IntNo = Intrinsic::ceil;
19455       break;
19456     case WebAssembly::BI__builtin_wasm_floor_f32x4:
19457     case WebAssembly::BI__builtin_wasm_floor_f64x2:
19458       IntNo = Intrinsic::floor;
19459       break;
19460     case WebAssembly::BI__builtin_wasm_trunc_f32x4:
19461     case WebAssembly::BI__builtin_wasm_trunc_f64x2:
19462       IntNo = Intrinsic::trunc;
19463       break;
19464     case WebAssembly::BI__builtin_wasm_nearest_f32x4:
19465     case WebAssembly::BI__builtin_wasm_nearest_f64x2:
19466       IntNo = Intrinsic::nearbyint;
19467       break;
19468     default:
19469       llvm_unreachable("unexpected builtin ID");
19470     }
19471     Value *Value = EmitScalarExpr(E->getArg(0));
19472     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
19473     return Builder.CreateCall(Callee, Value);
19474   }
19475   case WebAssembly::BI__builtin_wasm_ref_null_extern: {
19476     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_extern);
19477     return Builder.CreateCall(Callee);
19478   }
19479   case WebAssembly::BI__builtin_wasm_ref_null_func: {
19480     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_func);
19481     return Builder.CreateCall(Callee);
19482   }
19483   case WebAssembly::BI__builtin_wasm_swizzle_i8x16: {
19484     Value *Src = EmitScalarExpr(E->getArg(0));
19485     Value *Indices = EmitScalarExpr(E->getArg(1));
19486     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_swizzle);
19487     return Builder.CreateCall(Callee, {Src, Indices});
19488   }
19489   case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
19490   case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
19491   case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
19492   case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
19493   case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
19494   case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
19495   case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
19496   case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: {
19497     unsigned IntNo;
19498     switch (BuiltinID) {
19499     case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
19500     case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
19501       IntNo = Intrinsic::sadd_sat;
19502       break;
19503     case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
19504     case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
19505       IntNo = Intrinsic::uadd_sat;
19506       break;
19507     case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
19508     case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
19509       IntNo = Intrinsic::wasm_sub_sat_signed;
19510       break;
19511     case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
19512     case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8:
19513       IntNo = Intrinsic::wasm_sub_sat_unsigned;
19514       break;
19515     default:
19516       llvm_unreachable("unexpected builtin ID");
19517     }
19518     Value *LHS = EmitScalarExpr(E->getArg(0));
19519     Value *RHS = EmitScalarExpr(E->getArg(1));
19520     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
19521     return Builder.CreateCall(Callee, {LHS, RHS});
19522   }
19523   case WebAssembly::BI__builtin_wasm_abs_i8x16:
19524   case WebAssembly::BI__builtin_wasm_abs_i16x8:
19525   case WebAssembly::BI__builtin_wasm_abs_i32x4:
19526   case WebAssembly::BI__builtin_wasm_abs_i64x2: {
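    // Integer abs is expanded to a compare-and-select, select(v < 0, -v, v),
    // rather than an intrinsic call.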
19527     Value *Vec = EmitScalarExpr(E->getArg(0));
19528     Value *Neg = Builder.CreateNeg(Vec, "neg");
19529     Constant *Zero = llvm::Constant::getNullValue(Vec->getType());
19530     Value *ICmp = Builder.CreateICmpSLT(Vec, Zero, "abscond");
19531     return Builder.CreateSelect(ICmp, Neg, Vec, "abs");
19532   }
19533   case WebAssembly::BI__builtin_wasm_min_s_i8x16:
19534   case WebAssembly::BI__builtin_wasm_min_u_i8x16:
19535   case WebAssembly::BI__builtin_wasm_max_s_i8x16:
19536   case WebAssembly::BI__builtin_wasm_max_u_i8x16:
19537   case WebAssembly::BI__builtin_wasm_min_s_i16x8:
19538   case WebAssembly::BI__builtin_wasm_min_u_i16x8:
19539   case WebAssembly::BI__builtin_wasm_max_s_i16x8:
19540   case WebAssembly::BI__builtin_wasm_max_u_i16x8:
19541   case WebAssembly::BI__builtin_wasm_min_s_i32x4:
19542   case WebAssembly::BI__builtin_wasm_min_u_i32x4:
19543   case WebAssembly::BI__builtin_wasm_max_s_i32x4:
19544   case WebAssembly::BI__builtin_wasm_max_u_i32x4: {
19545     Value *LHS = EmitScalarExpr(E->getArg(0));
19546     Value *RHS = EmitScalarExpr(E->getArg(1));
19547     Value *ICmp;
19548     switch (BuiltinID) {
19549     case WebAssembly::BI__builtin_wasm_min_s_i8x16:
19550     case WebAssembly::BI__builtin_wasm_min_s_i16x8:
19551     case WebAssembly::BI__builtin_wasm_min_s_i32x4:
19552       ICmp = Builder.CreateICmpSLT(LHS, RHS);
19553       break;
19554     case WebAssembly::BI__builtin_wasm_min_u_i8x16:
19555     case WebAssembly::BI__builtin_wasm_min_u_i16x8:
19556     case WebAssembly::BI__builtin_wasm_min_u_i32x4:
19557       ICmp = Builder.CreateICmpULT(LHS, RHS);
19558       break;
19559     case WebAssembly::BI__builtin_wasm_max_s_i8x16:
19560     case WebAssembly::BI__builtin_wasm_max_s_i16x8:
19561     case WebAssembly::BI__builtin_wasm_max_s_i32x4:
19562       ICmp = Builder.CreateICmpSGT(LHS, RHS);
19563       break;
19564     case WebAssembly::BI__builtin_wasm_max_u_i8x16:
19565     case WebAssembly::BI__builtin_wasm_max_u_i16x8:
19566     case WebAssembly::BI__builtin_wasm_max_u_i32x4:
19567       ICmp = Builder.CreateICmpUGT(LHS, RHS);
19568       break;
19569     default:
19570       llvm_unreachable("unexpected builtin ID");
19571     }
19572     return Builder.CreateSelect(ICmp, LHS, RHS);
19573   }
19574   case WebAssembly::BI__builtin_wasm_avgr_u_i8x16:
19575   case WebAssembly::BI__builtin_wasm_avgr_u_i16x8: {
19576     Value *LHS = EmitScalarExpr(E->getArg(0));
19577     Value *RHS = EmitScalarExpr(E->getArg(1));
19578     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_avgr_unsigned,
19579                                         ConvertType(E->getType()));
19580     return Builder.CreateCall(Callee, {LHS, RHS});
19581   }
19582   case WebAssembly::BI__builtin_wasm_q15mulr_sat_s_i16x8: {
19583     Value *LHS = EmitScalarExpr(E->getArg(0));
19584     Value *RHS = EmitScalarExpr(E->getArg(1));
19585     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_q15mulr_sat_signed);
19586     return Builder.CreateCall(Callee, {LHS, RHS});
19587   }
19588   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
19589   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
19590   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
19591   case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4: {
19592     Value *Vec = EmitScalarExpr(E->getArg(0));
19593     unsigned IntNo;
19594     switch (BuiltinID) {
19595     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
19596     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
19597       IntNo = Intrinsic::wasm_extadd_pairwise_signed;
19598       break;
19599     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
19600     case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4:
19601       IntNo = Intrinsic::wasm_extadd_pairwise_unsigned;
19602       break;
19603     default:
19604       llvm_unreachable("unexpected builtin ID");
19605     }
19606 
19607     Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
19608     return Builder.CreateCall(Callee, Vec);
19609   }
19610   case WebAssembly::BI__builtin_wasm_bitselect: {
19611     Value *V1 = EmitScalarExpr(E->getArg(0));
19612     Value *V2 = EmitScalarExpr(E->getArg(1));
19613     Value *C = EmitScalarExpr(E->getArg(2));
19614     Function *Callee =
19615         CGM.getIntrinsic(Intrinsic::wasm_bitselect, ConvertType(E->getType()));
19616     return Builder.CreateCall(Callee, {V1, V2, C});
19617   }
19618   case WebAssembly::BI__builtin_wasm_dot_s_i32x4_i16x8: {
19619     Value *LHS = EmitScalarExpr(E->getArg(0));
19620     Value *RHS = EmitScalarExpr(E->getArg(1));
19621     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_dot);
19622     return Builder.CreateCall(Callee, {LHS, RHS});
19623   }
19624   case WebAssembly::BI__builtin_wasm_popcnt_i8x16: {
19625     Value *Vec = EmitScalarExpr(E->getArg(0));
19626     Function *Callee =
19627         CGM.getIntrinsic(Intrinsic::ctpop, ConvertType(E->getType()));
19628     return Builder.CreateCall(Callee, {Vec});
19629   }
19630   case WebAssembly::BI__builtin_wasm_any_true_v128:
19631   case WebAssembly::BI__builtin_wasm_all_true_i8x16:
19632   case WebAssembly::BI__builtin_wasm_all_true_i16x8:
19633   case WebAssembly::BI__builtin_wasm_all_true_i32x4:
19634   case WebAssembly::BI__builtin_wasm_all_true_i64x2: {
19635     unsigned IntNo;
19636     switch (BuiltinID) {
19637     case WebAssembly::BI__builtin_wasm_any_true_v128:
19638       IntNo = Intrinsic::wasm_anytrue;
19639       break;
19640     case WebAssembly::BI__builtin_wasm_all_true_i8x16:
19641     case WebAssembly::BI__builtin_wasm_all_true_i16x8:
19642     case WebAssembly::BI__builtin_wasm_all_true_i32x4:
19643     case WebAssembly::BI__builtin_wasm_all_true_i64x2:
19644       IntNo = Intrinsic::wasm_alltrue;
19645       break;
19646     default:
19647       llvm_unreachable("unexpected builtin ID");
19648     }
19649     Value *Vec = EmitScalarExpr(E->getArg(0));
19650     Function *Callee = CGM.getIntrinsic(IntNo, Vec->getType());
19651     return Builder.CreateCall(Callee, {Vec});
19652   }
19653   case WebAssembly::BI__builtin_wasm_bitmask_i8x16:
19654   case WebAssembly::BI__builtin_wasm_bitmask_i16x8:
19655   case WebAssembly::BI__builtin_wasm_bitmask_i32x4:
19656   case WebAssembly::BI__builtin_wasm_bitmask_i64x2: {
19657     Value *Vec = EmitScalarExpr(E->getArg(0));
19658     Function *Callee =
19659         CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
19660     return Builder.CreateCall(Callee, {Vec});
19661   }
19662   case WebAssembly::BI__builtin_wasm_abs_f32x4:
19663   case WebAssembly::BI__builtin_wasm_abs_f64x2: {
19664     Value *Vec = EmitScalarExpr(E->getArg(0));
19665     Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
19666     return Builder.CreateCall(Callee, {Vec});
19667   }
19668   case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
19669   case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
19670     Value *Vec = EmitScalarExpr(E->getArg(0));
19671     Function *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType());
19672     return Builder.CreateCall(Callee, {Vec});
19673   }
19674   case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
19675   case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
19676   case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
19677   case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {
19678     Value *Low = EmitScalarExpr(E->getArg(0));
19679     Value *High = EmitScalarExpr(E->getArg(1));
19680     unsigned IntNo;
19681     switch (BuiltinID) {
19682     case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
19683     case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
19684       IntNo = Intrinsic::wasm_narrow_signed;
19685       break;
19686     case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
19687     case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4:
19688       IntNo = Intrinsic::wasm_narrow_unsigned;
19689       break;
19690     default:
19691       llvm_unreachable("unexpected builtin ID");
19692     }
19693     Function *Callee =
19694         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
19695     return Builder.CreateCall(Callee, {Low, High});
19696   }
19697   case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
19698   case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4: {
19699     Value *Vec = EmitScalarExpr(E->getArg(0));
19700     unsigned IntNo;
19701     switch (BuiltinID) {
19702     case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
19703       IntNo = Intrinsic::fptosi_sat;
19704       break;
19705     case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4:
19706       IntNo = Intrinsic::fptoui_sat;
19707       break;
19708     default:
19709       llvm_unreachable("unexpected builtin ID");
19710     }
19711     llvm::Type *SrcT = Vec->getType();
19712     llvm::Type *TruncT = SrcT->getWithNewType(Builder.getInt32Ty());
19713     Function *Callee = CGM.getIntrinsic(IntNo, {TruncT, SrcT});
19714     Value *Trunc = Builder.CreateCall(Callee, Vec);
19715     Value *Splat = Constant::getNullValue(TruncT);
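    // The saturating f64x2 conversion only produces two i32 lanes; widen the
    // result to i32x4 by shuffling in zeros for the upper two lanes.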
19716     return Builder.CreateShuffleVector(Trunc, Splat, ArrayRef<int>{0, 1, 2, 3});
19717   }
19718   case WebAssembly::BI__builtin_wasm_shuffle_i8x16: {
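    // The operands are the two input vectors followed by the 16 constant lane
    // indices, 18 values in total.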
19719     Value *Ops[18];
19720     size_t OpIdx = 0;
19721     Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
19722     Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
19723     while (OpIdx < 18) {
19724       std::optional<llvm::APSInt> LaneConst =
19725           E->getArg(OpIdx)->getIntegerConstantExpr(getContext());
19726       assert(LaneConst && "Constant arg isn't actually constant?");
19727       Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), *LaneConst);
19728     }
19729     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
19730     return Builder.CreateCall(Callee, Ops);
19731   }
19732   case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
19733   case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
19734   case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
19735   case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2: {
19736     Value *A = EmitScalarExpr(E->getArg(0));
19737     Value *B = EmitScalarExpr(E->getArg(1));
19738     Value *C = EmitScalarExpr(E->getArg(2));
19739     unsigned IntNo;
19740     switch (BuiltinID) {
19741     case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
19742     case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
19743       IntNo = Intrinsic::wasm_relaxed_madd;
19744       break;
19745     case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
19746     case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2:
19747       IntNo = Intrinsic::wasm_relaxed_nmadd;
19748       break;
19749     default:
19750       llvm_unreachable("unexpected builtin ID");
19751     }
19752     Function *Callee = CGM.getIntrinsic(IntNo, A->getType());
19753     return Builder.CreateCall(Callee, {A, B, C});
19754   }
19755   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i8x16:
19756   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i16x8:
19757   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i32x4:
19758   case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i64x2: {
19759     Value *A = EmitScalarExpr(E->getArg(0));
19760     Value *B = EmitScalarExpr(E->getArg(1));
19761     Value *C = EmitScalarExpr(E->getArg(2));
19762     Function *Callee =
19763         CGM.getIntrinsic(Intrinsic::wasm_relaxed_laneselect, A->getType());
19764     return Builder.CreateCall(Callee, {A, B, C});
19765   }
19766   case WebAssembly::BI__builtin_wasm_relaxed_swizzle_i8x16: {
19767     Value *Src = EmitScalarExpr(E->getArg(0));
19768     Value *Indices = EmitScalarExpr(E->getArg(1));
19769     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_swizzle);
19770     return Builder.CreateCall(Callee, {Src, Indices});
19771   }
19772   case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
19773   case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
19774   case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
19775   case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2: {
19776     Value *LHS = EmitScalarExpr(E->getArg(0));
19777     Value *RHS = EmitScalarExpr(E->getArg(1));
19778     unsigned IntNo;
19779     switch (BuiltinID) {
19780     case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
19781     case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
19782       IntNo = Intrinsic::wasm_relaxed_min;
19783       break;
19784     case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
19785     case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2:
19786       IntNo = Intrinsic::wasm_relaxed_max;
19787       break;
19788     default:
19789       llvm_unreachable("unexpected builtin ID");
19790     }
19791     Function *Callee = CGM.getIntrinsic(IntNo, LHS->getType());
19792     return Builder.CreateCall(Callee, {LHS, RHS});
19793   }
19794   case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
19795   case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
19796   case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
19797   case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2: {
19798     Value *Vec = EmitScalarExpr(E->getArg(0));
19799     unsigned IntNo;
19800     switch (BuiltinID) {
19801     case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
19802       IntNo = Intrinsic::wasm_relaxed_trunc_signed;
19803       break;
19804     case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
19805       IntNo = Intrinsic::wasm_relaxed_trunc_unsigned;
19806       break;
19807     case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
19808       IntNo = Intrinsic::wasm_relaxed_trunc_signed_zero;
19809       break;
19810     case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2:
19811       IntNo = Intrinsic::wasm_relaxed_trunc_unsigned_zero;
19812       break;
19813     default:
19814       llvm_unreachable("unexpected builtin ID");
19815     }
19816     Function *Callee = CGM.getIntrinsic(IntNo);
19817     return Builder.CreateCall(Callee, {Vec});
19818   }
19819   case WebAssembly::BI__builtin_wasm_relaxed_q15mulr_s_i16x8: {
19820     Value *LHS = EmitScalarExpr(E->getArg(0));
19821     Value *RHS = EmitScalarExpr(E->getArg(1));
19822     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_q15mulr_signed);
19823     return Builder.CreateCall(Callee, {LHS, RHS});
19824   }
19825   case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_s_i16x8: {
19826     Value *LHS = EmitScalarExpr(E->getArg(0));
19827     Value *RHS = EmitScalarExpr(E->getArg(1));
19828     Function *Callee =
19829         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_signed);
19830     return Builder.CreateCall(Callee, {LHS, RHS});
19831   }
19832   case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4: {
19833     Value *LHS = EmitScalarExpr(E->getArg(0));
19834     Value *RHS = EmitScalarExpr(E->getArg(1));
19835     Value *Acc = EmitScalarExpr(E->getArg(2));
19836     Function *Callee =
19837         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_add_signed);
19838     return Builder.CreateCall(Callee, {LHS, RHS, Acc});
19839   }
19840   case WebAssembly::BI__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4: {
19841     Value *LHS = EmitScalarExpr(E->getArg(0));
19842     Value *RHS = EmitScalarExpr(E->getArg(1));
19843     Value *Acc = EmitScalarExpr(E->getArg(2));
19844     Function *Callee =
19845         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_bf16x8_add_f32);
19846     return Builder.CreateCall(Callee, {LHS, RHS, Acc});
19847   }
19848   case WebAssembly::BI__builtin_wasm_table_get: {
19849     assert(E->getArg(0)->getType()->isArrayType());
19850     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19851     Value *Index = EmitScalarExpr(E->getArg(1));
19852     Function *Callee;
19853     if (E->getType().isWebAssemblyExternrefType())
19854       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_externref);
19855     else if (E->getType().isWebAssemblyFuncrefType())
19856       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_funcref);
19857     else
19858       llvm_unreachable(
19859           "Unexpected reference type for __builtin_wasm_table_get");
19860     return Builder.CreateCall(Callee, {Table, Index});
19861   }
19862   case WebAssembly::BI__builtin_wasm_table_set: {
19863     assert(E->getArg(0)->getType()->isArrayType());
19864     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19865     Value *Index = EmitScalarExpr(E->getArg(1));
19866     Value *Val = EmitScalarExpr(E->getArg(2));
19867     Function *Callee;
19868     if (E->getArg(2)->getType().isWebAssemblyExternrefType())
19869       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_externref);
19870     else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
19871       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_funcref);
19872     else
19873       llvm_unreachable(
19874           "Unexpected reference type for __builtin_wasm_table_set");
19875     return Builder.CreateCall(Callee, {Table, Index, Val});
19876   }
19877   case WebAssembly::BI__builtin_wasm_table_size: {
19878     assert(E->getArg(0)->getType()->isArrayType());
19879     Value *Value = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19880     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_size);
19881     return Builder.CreateCall(Callee, Value);
19882   }
19883   case WebAssembly::BI__builtin_wasm_table_grow: {
19884     assert(E->getArg(0)->getType()->isArrayType());
19885     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19886     Value *Val = EmitScalarExpr(E->getArg(1));
19887     Value *NElems = EmitScalarExpr(E->getArg(2));
19888 
19889     Function *Callee;
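    // FIXME: the funcref case below is lowered with the table.fill intrinsic;
    // check whether a dedicated grow lowering is intended here.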
19890     if (E->getArg(1)->getType().isWebAssemblyExternrefType())
19891       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_grow_externref);
19892     else if (E->getArg(1)->getType().isWebAssemblyFuncrefType())
19893       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
19894     else
19895       llvm_unreachable(
19896           "Unexpected reference type for __builtin_wasm_table_grow");
19897 
19898     return Builder.CreateCall(Callee, {Table, Val, NElems});
19899   }
19900   case WebAssembly::BI__builtin_wasm_table_fill: {
19901     assert(E->getArg(0)->getType()->isArrayType());
19902     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19903     Value *Index = EmitScalarExpr(E->getArg(1));
19904     Value *Val = EmitScalarExpr(E->getArg(2));
19905     Value *NElems = EmitScalarExpr(E->getArg(3));
19906 
19907     Function *Callee;
19908     if (E->getArg(2)->getType().isWebAssemblyExternrefType())
19909       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_externref);
19910     else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
19911       Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
19912     else
19913       llvm_unreachable(
19914           "Unexpected reference type for __builtin_wasm_table_fill");
19915 
19916     return Builder.CreateCall(Callee, {Table, Index, Val, NElems});
19917   }
19918   case WebAssembly::BI__builtin_wasm_table_copy: {
19919     assert(E->getArg(0)->getType()->isArrayType());
19920     Value *TableX = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
19921     Value *TableY = EmitArrayToPointerDecay(E->getArg(1)).getPointer();
19922     Value *DstIdx = EmitScalarExpr(E->getArg(2));
19923     Value *SrcIdx = EmitScalarExpr(E->getArg(3));
19924     Value *NElems = EmitScalarExpr(E->getArg(4));
19925 
19926     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_copy);
19927 
19928     return Builder.CreateCall(Callee, {TableX, TableY, SrcIdx, DstIdx, NElems});
19929   }
19930   default:
19931     return nullptr;
19932   }
19933 }
19934 
19935 static std::pair<Intrinsic::ID, unsigned>
19936 getIntrinsicForHexagonNonClangBuiltin(unsigned BuiltinID) {
19937   struct Info {
19938     unsigned BuiltinID;
19939     Intrinsic::ID IntrinsicID;
19940     unsigned VecLen;
19941   };
19942   static Info Infos[] = {
19943 #define CUSTOM_BUILTIN_MAPPING(x,s) \
19944   { Hexagon::BI__builtin_HEXAGON_##x, Intrinsic::hexagon_##x, s },
19945     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pci, 0)
19946     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pci, 0)
19947     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pci, 0)
19948     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pci, 0)
19949     CUSTOM_BUILTIN_MAPPING(L2_loadri_pci, 0)
19950     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pci, 0)
19951     CUSTOM_BUILTIN_MAPPING(L2_loadrub_pcr, 0)
19952     CUSTOM_BUILTIN_MAPPING(L2_loadrb_pcr, 0)
19953     CUSTOM_BUILTIN_MAPPING(L2_loadruh_pcr, 0)
19954     CUSTOM_BUILTIN_MAPPING(L2_loadrh_pcr, 0)
19955     CUSTOM_BUILTIN_MAPPING(L2_loadri_pcr, 0)
19956     CUSTOM_BUILTIN_MAPPING(L2_loadrd_pcr, 0)
19957     CUSTOM_BUILTIN_MAPPING(S2_storerb_pci, 0)
19958     CUSTOM_BUILTIN_MAPPING(S2_storerh_pci, 0)
19959     CUSTOM_BUILTIN_MAPPING(S2_storerf_pci, 0)
19960     CUSTOM_BUILTIN_MAPPING(S2_storeri_pci, 0)
19961     CUSTOM_BUILTIN_MAPPING(S2_storerd_pci, 0)
19962     CUSTOM_BUILTIN_MAPPING(S2_storerb_pcr, 0)
19963     CUSTOM_BUILTIN_MAPPING(S2_storerh_pcr, 0)
19964     CUSTOM_BUILTIN_MAPPING(S2_storerf_pcr, 0)
19965     CUSTOM_BUILTIN_MAPPING(S2_storeri_pcr, 0)
19966     CUSTOM_BUILTIN_MAPPING(S2_storerd_pcr, 0)
19967     // Legacy builtins that take a vector in place of a vector predicate.
19968     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq, 64)
19969     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq, 64)
19970     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq, 64)
19971     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq, 64)
19972     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq_128B, 128)
19973     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq_128B, 128)
19974     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq_128B, 128)
19975     CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq_128B, 128)
19976 #include "clang/Basic/BuiltinsHexagonMapCustomDep.def"
19977 #undef CUSTOM_BUILTIN_MAPPING
19978   };
19979 
19980   auto CmpInfo = [] (Info A, Info B) { return A.BuiltinID < B.BuiltinID; };
19981   static const bool SortOnce = (llvm::sort(Infos, CmpInfo), true);
19982   (void)SortOnce;
19983 
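  // Infos was sorted above on first use, so the builtin ID can be looked up
  // with a binary search.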
19984   const Info *F = llvm::lower_bound(Infos, Info{BuiltinID, 0, 0}, CmpInfo);
19985   if (F == std::end(Infos) || F->BuiltinID != BuiltinID)
19986     return {Intrinsic::not_intrinsic, 0};
19987 
19988   return {F->IntrinsicID, F->VecLen};
19989 }
19990 
19991 Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
19992                                                const CallExpr *E) {
19993   Intrinsic::ID ID;
19994   unsigned VecLen;
19995   std::tie(ID, VecLen) = getIntrinsicForHexagonNonClangBuiltin(BuiltinID);
19996 
19997   auto MakeCircOp = [this, E](unsigned IntID, bool IsLoad) {
19998     // The base pointer is passed by address, so it needs to be loaded.
19999     Address A = EmitPointerWithAlignment(E->getArg(0));
20000     Address BP = Address(Builder.CreateBitCast(
20001         A.getPointer(), Int8PtrPtrTy), Int8PtrTy, A.getAlignment());
20002     llvm::Value *Base = Builder.CreateLoad(BP);
20003     // The treatment of both loads and stores is the same: the arguments for
20004     // the builtin are the same as the arguments for the intrinsic.
20005     // Load:
20006     //   builtin(Base, Inc, Mod, Start) -> intr(Base, Inc, Mod, Start)
20007     //   builtin(Base, Mod, Start)      -> intr(Base, Mod, Start)
20008     // Store:
20009     //   builtin(Base, Inc, Mod, Val, Start) -> intr(Base, Inc, Mod, Val, Start)
20010     //   builtin(Base, Mod, Val, Start)      -> intr(Base, Mod, Val, Start)
20011     SmallVector<llvm::Value*,5> Ops = { Base };
20012     for (unsigned i = 1, e = E->getNumArgs(); i != e; ++i)
20013       Ops.push_back(EmitScalarExpr(E->getArg(i)));
20014 
20015     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
20016     // The load intrinsics generate two results (Value, NewBase), stores
20017     // generate one (NewBase). The new base address needs to be stored.
20018     llvm::Value *NewBase = IsLoad ? Builder.CreateExtractValue(Result, 1)
20019                                   : Result;
20020     llvm::Value *LV = EmitScalarExpr(E->getArg(0));
20021     Address Dest = EmitPointerWithAlignment(E->getArg(0));
20022     llvm::Value *RetVal =
20023         Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
20024     if (IsLoad)
20025       RetVal = Builder.CreateExtractValue(Result, 0);
20026     return RetVal;
20027   };
20028 
20029   // Handle the conversion of the bit-reverse load builtins to LLVM IR.
20030   // The intrinsic call emitted below only reads from memory; the write to
20031   // memory is handled by the store instruction that follows it.
20032   auto MakeBrevLd = [this, E](unsigned IntID, llvm::Type *DestTy) {
20033     // The intrinsic generates one result, the new value of the base pointer,
20034     // which is what the builtin returns. The loaded value is passed back to
20035     // the caller through a pointer argument, so it has to be stored explicitly.
20036     llvm::Value *BaseAddress =
20037         Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
20038 
20039     // Expressions like &(*pt++) are incremented on every evaluation;
20040     // EmitPointerWithAlignment and EmitScalarExpr each evaluate their
20041     // expression once per call, so the argument is emitted only once here.
20042     Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
20043     DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy),
20044                        Int8Ty, DestAddr.getAlignment());
20045     llvm::Value *DestAddress = DestAddr.getPointer();
20046 
20047     // Operands are Base, Dest, Modifier.
20048     // The intrinsic format in LLVM IR is defined as
20049     // { ValueType, i8* } (i8*, i32).
20050     llvm::Value *Result = Builder.CreateCall(
20051         CGM.getIntrinsic(IntID), {BaseAddress, EmitScalarExpr(E->getArg(2))});
20052 
20053     // The value needs to be stored as the variable is passed by reference.
20054     llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
20055 
20056     // The stored value needs to be truncated to fit the destination type.
20057     // While i32 and i64 are natively supported on Hexagon, i8 and i16 need
20058     // to be handled with stores of the respective destination type.
20059     DestVal = Builder.CreateTrunc(DestVal, DestTy);
20060 
20061     Builder.CreateAlignedStore(DestVal, DestAddress, DestAddr.getAlignment());
20062     // The updated value of the base pointer is returned.
20063     return Builder.CreateExtractValue(Result, 1);
20064   };
20065 
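  // Helpers to convert between HVX vectors and vector predicates: V2Q and Q2V
  // wrap the vandvrt/vandqrt intrinsics with an all-ones scalar mask.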
20066   auto V2Q = [this, VecLen] (llvm::Value *Vec) {
20067     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandvrt_128B
20068                                      : Intrinsic::hexagon_V6_vandvrt;
20069     return Builder.CreateCall(CGM.getIntrinsic(ID),
20070                               {Vec, Builder.getInt32(-1)});
20071   };
20072   auto Q2V = [this, VecLen] (llvm::Value *Pred) {
20073     Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandqrt_128B
20074                                      : Intrinsic::hexagon_V6_vandqrt;
20075     return Builder.CreateCall(CGM.getIntrinsic(ID),
20076                               {Pred, Builder.getInt32(-1)});
20077   };
20078 
20079   switch (BuiltinID) {
20080   // These intrinsics return a tuple {Vector, VectorPred} in LLVM IR,
20081   // and the corresponding C/C++ builtins use loads/stores to update
20082   // the predicate.
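  // For example, for V6_vaddcarry the intrinsic is called with the two vector
  // operands plus the carry-in predicate loaded through the third (pointer)
  // argument; element 1 of the result (the carry-out predicate) is stored back
  // through that pointer and element 0 (the sum vector) is returned.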
20083   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
20084   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B:
20085   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
20086   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
20087     // Get the type from the 0-th argument.
20088     llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
20089     Address PredAddr =
20090         EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
20091     llvm::Value *PredIn = V2Q(Builder.CreateLoad(PredAddr));
20092     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
20093         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), PredIn});
20094 
20095     llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
20096     Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
20097         PredAddr.getAlignment());
20098     return Builder.CreateExtractValue(Result, 0);
20099   }
  // These are identical to the builtins above, except that they don't consume
  // an input carry; they only generate a carry-out. Since they still produce
  // two outputs, generate the store of the predicate, but no load.
20103   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo:
20104   case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo_128B:
20105   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo:
20106   case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo_128B: {
20107     // Get the type from the 0-th argument.
20108     llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
20109     Address PredAddr =
20110         EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
20111     llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
20112         {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
20113 
20114     llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
20115     Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
20116         PredAddr.getAlignment());
20117     return Builder.CreateExtractValue(Result, 0);
20118   }
20119 
20120   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq:
20121   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq:
20122   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq:
20123   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq:
20124   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq_128B:
20125   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq_128B:
20126   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq_128B:
20127   case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq_128B: {
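    // The masked-store builtins take the predicate as their first operand;
    // convert it to a Q register with V2Q and pass the remaining operands
    // through unchanged.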
20128     SmallVector<llvm::Value*,4> Ops;
20129     const Expr *PredOp = E->getArg(0);
20130     // There will be an implicit cast to a boolean vector. Strip it.
20131     if (auto *Cast = dyn_cast<ImplicitCastExpr>(PredOp)) {
20132       if (Cast->getCastKind() == CK_BitCast)
20133         PredOp = Cast->getSubExpr();
20134       Ops.push_back(V2Q(EmitScalarExpr(PredOp)));
20135     }
20136     for (int i = 1, e = E->getNumArgs(); i != e; ++i)
20137       Ops.push_back(EmitScalarExpr(E->getArg(i)));
20138     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
20139   }
20140 
20141   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
20142   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
20143   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
20144   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
20145   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
20146   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
20147   case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
20148   case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
20149   case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
20150   case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
20151   case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
20152   case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
20153     return MakeCircOp(ID, /*IsLoad=*/true);
20154   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
20155   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
20156   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
20157   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
20158   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
20159   case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
20160   case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
20161   case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
20162   case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
20163   case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
20164     return MakeCircOp(ID, /*IsLoad=*/false);
20165   case Hexagon::BI__builtin_brev_ldub:
20166     return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
20167   case Hexagon::BI__builtin_brev_ldb:
20168     return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
20169   case Hexagon::BI__builtin_brev_lduh:
20170     return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
20171   case Hexagon::BI__builtin_brev_ldh:
20172     return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
20173   case Hexagon::BI__builtin_brev_ldw:
20174     return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
20175   case Hexagon::BI__builtin_brev_ldd:
20176     return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
20177   } // switch
20178 
20179   return nullptr;
20180 }
20181 
20182 Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
20183                                              const CallExpr *E,
20184                                              ReturnValueSlot ReturnValue) {
20185   SmallVector<Value *, 4> Ops;
20186   llvm::Type *ResultType = ConvertType(E->getType());
20187 
20188   // Find out if any arguments are required to be integer constant expressions.
20189   unsigned ICEArguments = 0;
20190   ASTContext::GetBuiltinTypeError Error;
20191   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
20192   if (Error == ASTContext::GE_Missing_type) {
20193     // Vector intrinsics don't have a type string.
20194     assert(BuiltinID >= clang::RISCV::FirstRVVBuiltin &&
20195            BuiltinID <= clang::RISCV::LastRVVBuiltin);
20196     ICEArguments = 0;
20197     if (BuiltinID == RISCVVector::BI__builtin_rvv_vget_v ||
20198         BuiltinID == RISCVVector::BI__builtin_rvv_vset_v)
20199       ICEArguments = 1 << 1;
20200   } else {
20201     assert(Error == ASTContext::GE_None && "Unexpected error");
20202   }
20203 
20204   if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
20205     ICEArguments |= (1 << 1);
20206   if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
20207     ICEArguments |= (1 << 2);
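  // The domain operand of the non-temporal load/store builtins must be an
  // integer constant expression so that it reaches the metadata emission
  // below as a ConstantInt.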
20208 
20209   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
    // Handle aggregate arguments, namely the RVV tuple types used by segment
    // loads/stores.
20211     if (hasAggregateEvaluationKind(E->getArg(i)->getType())) {
20212       LValue L = EmitAggExprToLValue(E->getArg(i));
20213       llvm::Value *AggValue = Builder.CreateLoad(L.getAddress(*this));
20214       Ops.push_back(AggValue);
20215       continue;
20216     }
20217 
20218     // If this is a normal argument, just emit it as a scalar.
20219     if ((ICEArguments & (1 << i)) == 0) {
20220       Ops.push_back(EmitScalarExpr(E->getArg(i)));
20221       continue;
20222     }
20223 
20224     // If this is required to be a constant, constant fold it so that we know
20225     // that the generated intrinsic gets a ConstantInt.
20226     Ops.push_back(llvm::ConstantInt::get(
20227         getLLVMContext(), *E->getArg(i)->getIntegerConstantExpr(getContext())));
20228   }
20229 
20230   Intrinsic::ID ID = Intrinsic::not_intrinsic;
20231   unsigned NF = 1;
  // The 0th bit simulates the `vta` of RVV.
  // The 1st bit simulates the `vma` of RVV.
20234   constexpr unsigned RVV_VTA = 0x1;
20235   constexpr unsigned RVV_VMA = 0x2;
20236   int PolicyAttrs = 0;
20237   bool IsMasked = false;
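  // PolicyAttrs and IsMasked are consumed by the generated RVV lowering code
  // included at the end of this switch.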
20238 
20239   // Required for overloaded intrinsics.
20240   llvm::SmallVector<llvm::Type *, 2> IntrinsicTypes;
20241   switch (BuiltinID) {
20242   default: llvm_unreachable("unexpected builtin ID");
20243   case RISCV::BI__builtin_riscv_orc_b_32:
20244   case RISCV::BI__builtin_riscv_orc_b_64:
20245   case RISCV::BI__builtin_riscv_clz_32:
20246   case RISCV::BI__builtin_riscv_clz_64:
20247   case RISCV::BI__builtin_riscv_ctz_32:
20248   case RISCV::BI__builtin_riscv_ctz_64:
20249   case RISCV::BI__builtin_riscv_clmul_32:
20250   case RISCV::BI__builtin_riscv_clmul_64:
20251   case RISCV::BI__builtin_riscv_clmulh_32:
20252   case RISCV::BI__builtin_riscv_clmulh_64:
20253   case RISCV::BI__builtin_riscv_clmulr_32:
20254   case RISCV::BI__builtin_riscv_clmulr_64:
20255   case RISCV::BI__builtin_riscv_xperm4_32:
20256   case RISCV::BI__builtin_riscv_xperm4_64:
20257   case RISCV::BI__builtin_riscv_xperm8_32:
20258   case RISCV::BI__builtin_riscv_xperm8_64:
20259   case RISCV::BI__builtin_riscv_brev8_32:
20260   case RISCV::BI__builtin_riscv_brev8_64:
20261   case RISCV::BI__builtin_riscv_zip_32:
20262   case RISCV::BI__builtin_riscv_unzip_32: {
20263     switch (BuiltinID) {
20264     default: llvm_unreachable("unexpected builtin ID");
20265     // Zbb
20266     case RISCV::BI__builtin_riscv_orc_b_32:
20267     case RISCV::BI__builtin_riscv_orc_b_64:
20268       ID = Intrinsic::riscv_orc_b;
20269       break;
20270     case RISCV::BI__builtin_riscv_clz_32:
20271     case RISCV::BI__builtin_riscv_clz_64: {
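      // Lower to llvm.ctlz with is-zero-poison set to false, and cast the
      // result to the builtin's return type if the widths differ.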
20272       Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
20273       Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
20274       if (Result->getType() != ResultType)
20275         Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
20276                                        "cast");
20277       return Result;
20278     }
20279     case RISCV::BI__builtin_riscv_ctz_32:
20280     case RISCV::BI__builtin_riscv_ctz_64: {
20281       Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
20282       Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
20283       if (Result->getType() != ResultType)
20284         Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
20285                                        "cast");
20286       return Result;
20287     }
20288 
20289     // Zbc
20290     case RISCV::BI__builtin_riscv_clmul_32:
20291     case RISCV::BI__builtin_riscv_clmul_64:
20292       ID = Intrinsic::riscv_clmul;
20293       break;
20294     case RISCV::BI__builtin_riscv_clmulh_32:
20295     case RISCV::BI__builtin_riscv_clmulh_64:
20296       ID = Intrinsic::riscv_clmulh;
20297       break;
20298     case RISCV::BI__builtin_riscv_clmulr_32:
20299     case RISCV::BI__builtin_riscv_clmulr_64:
20300       ID = Intrinsic::riscv_clmulr;
20301       break;
20302 
20303     // Zbkx
20304     case RISCV::BI__builtin_riscv_xperm8_32:
20305     case RISCV::BI__builtin_riscv_xperm8_64:
20306       ID = Intrinsic::riscv_xperm8;
20307       break;
20308     case RISCV::BI__builtin_riscv_xperm4_32:
20309     case RISCV::BI__builtin_riscv_xperm4_64:
20310       ID = Intrinsic::riscv_xperm4;
20311       break;
20312 
20313     // Zbkb
20314     case RISCV::BI__builtin_riscv_brev8_32:
20315     case RISCV::BI__builtin_riscv_brev8_64:
20316       ID = Intrinsic::riscv_brev8;
20317       break;
20318     case RISCV::BI__builtin_riscv_zip_32:
20319       ID = Intrinsic::riscv_zip;
20320       break;
20321     case RISCV::BI__builtin_riscv_unzip_32:
20322       ID = Intrinsic::riscv_unzip;
20323       break;
20324     }
20325 
20326     IntrinsicTypes = {ResultType};
20327     break;
20328   }
20329 
20330   // Zk builtins
20331 
20332   // Zknh
20333   case RISCV::BI__builtin_riscv_sha256sig0:
20334     ID = Intrinsic::riscv_sha256sig0;
20335     break;
20336   case RISCV::BI__builtin_riscv_sha256sig1:
20337     ID = Intrinsic::riscv_sha256sig1;
20338     break;
20339   case RISCV::BI__builtin_riscv_sha256sum0:
20340     ID = Intrinsic::riscv_sha256sum0;
20341     break;
20342   case RISCV::BI__builtin_riscv_sha256sum1:
20343     ID = Intrinsic::riscv_sha256sum1;
20344     break;
20345 
20346   // Zksed
20347   case RISCV::BI__builtin_riscv_sm4ks:
20348     ID = Intrinsic::riscv_sm4ks;
20349     break;
20350   case RISCV::BI__builtin_riscv_sm4ed:
20351     ID = Intrinsic::riscv_sm4ed;
20352     break;
20353 
20354   // Zksh
20355   case RISCV::BI__builtin_riscv_sm3p0:
20356     ID = Intrinsic::riscv_sm3p0;
20357     break;
20358   case RISCV::BI__builtin_riscv_sm3p1:
20359     ID = Intrinsic::riscv_sm3p1;
20360     break;
20361 
20362   // Zihintntl
20363   case RISCV::BI__builtin_riscv_ntl_load: {
20364     llvm::Type *ResTy = ConvertType(E->getType());
20365     ConstantInt *Mode = cast<ConstantInt>(Ops[1]);
20366 
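    // Build the metadata nodes: a target-specific "riscv-nontemporal-domain"
    // node carrying the requested domain, plus the standard !nontemporal node.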
20367     llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
20368         getLLVMContext(),
20369         llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
20370     llvm::MDNode *NontemporalNode = llvm::MDNode::get(
20371         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
20372 
20373     int Width;
    if (ResTy->isScalableTy()) {
      const ScalableVectorType *SVTy = cast<ScalableVectorType>(ResTy);
      llvm::Type *ScalarTy = ResTy->getScalarType();
      Width = ScalarTy->getPrimitiveSizeInBits() *
              SVTy->getElementCount().getKnownMinValue();
    } else {
      Width = ResTy->getPrimitiveSizeInBits();
    }
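    // Use the size of the loaded type, in bytes, as the access alignment.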
20381     LoadInst *Load = Builder.CreateLoad(
20382         Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
20383 
20384     Load->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
20385     Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
20386                       RISCVDomainNode);
20387 
20388     return Load;
20389   }
20390   case RISCV::BI__builtin_riscv_ntl_store: {
20391     ConstantInt *Mode = cast<ConstantInt>(Ops[2]);
20392 
20393     llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
20394         getLLVMContext(),
20395         llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
20396     llvm::MDNode *NontemporalNode = llvm::MDNode::get(
20397         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
20398 
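    // Cast the destination pointer to the stored value's pointer type and tag
    // the store with the same nontemporal/domain metadata as the load above.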
20399     Value *BC = Builder.CreateBitCast(
20400         Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType()), "cast");
20401 
20402     StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], BC);
20403     Store->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
20404     Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
20405                        RISCVDomainNode);
20406 
20407     return Store;
20408   }
20409 
20410   // Vector builtins are handled from here.
20411 #include "clang/Basic/riscv_vector_builtin_cg.inc"
20412   // SiFive Vector builtins are handled from here.
20413 #include "clang/Basic/riscv_sifive_vector_builtin_cg.inc"
20414   }
20415 
20416   assert(ID != Intrinsic::not_intrinsic);
20417 
20418   llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes);
20419   return Builder.CreateCall(F, Ops, "");
20420 }
20421 
20422 Value *CodeGenFunction::EmitLoongArchBuiltinExpr(unsigned BuiltinID,
20423                                                  const CallExpr *E) {
20424   SmallVector<Value *, 4> Ops;
20425 
20426   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
20427     Ops.push_back(EmitScalarExpr(E->getArg(i)));
20428 
20429   Intrinsic::ID ID = Intrinsic::not_intrinsic;
20430 
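  // Each of these builtins maps directly to the LoongArch LLVM intrinsic of
  // the same name; the emitted arguments are passed through unchanged.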
20431   switch (BuiltinID) {
20432   default:
20433     llvm_unreachable("unexpected builtin ID.");
20434   case LoongArch::BI__builtin_loongarch_cacop_d:
20435     ID = Intrinsic::loongarch_cacop_d;
20436     break;
20437   case LoongArch::BI__builtin_loongarch_cacop_w:
20438     ID = Intrinsic::loongarch_cacop_w;
20439     break;
20440   case LoongArch::BI__builtin_loongarch_dbar:
20441     ID = Intrinsic::loongarch_dbar;
20442     break;
20443   case LoongArch::BI__builtin_loongarch_break:
20444     ID = Intrinsic::loongarch_break;
20445     break;
20446   case LoongArch::BI__builtin_loongarch_ibar:
20447     ID = Intrinsic::loongarch_ibar;
20448     break;
20449   case LoongArch::BI__builtin_loongarch_movfcsr2gr:
20450     ID = Intrinsic::loongarch_movfcsr2gr;
20451     break;
20452   case LoongArch::BI__builtin_loongarch_movgr2fcsr:
20453     ID = Intrinsic::loongarch_movgr2fcsr;
20454     break;
20455   case LoongArch::BI__builtin_loongarch_syscall:
20456     ID = Intrinsic::loongarch_syscall;
20457     break;
20458   case LoongArch::BI__builtin_loongarch_crc_w_b_w:
20459     ID = Intrinsic::loongarch_crc_w_b_w;
20460     break;
20461   case LoongArch::BI__builtin_loongarch_crc_w_h_w:
20462     ID = Intrinsic::loongarch_crc_w_h_w;
20463     break;
20464   case LoongArch::BI__builtin_loongarch_crc_w_w_w:
20465     ID = Intrinsic::loongarch_crc_w_w_w;
20466     break;
20467   case LoongArch::BI__builtin_loongarch_crc_w_d_w:
20468     ID = Intrinsic::loongarch_crc_w_d_w;
20469     break;
20470   case LoongArch::BI__builtin_loongarch_crcc_w_b_w:
20471     ID = Intrinsic::loongarch_crcc_w_b_w;
20472     break;
20473   case LoongArch::BI__builtin_loongarch_crcc_w_h_w:
20474     ID = Intrinsic::loongarch_crcc_w_h_w;
20475     break;
20476   case LoongArch::BI__builtin_loongarch_crcc_w_w_w:
20477     ID = Intrinsic::loongarch_crcc_w_w_w;
20478     break;
20479   case LoongArch::BI__builtin_loongarch_crcc_w_d_w:
20480     ID = Intrinsic::loongarch_crcc_w_d_w;
20481     break;
20482   case LoongArch::BI__builtin_loongarch_csrrd_w:
20483     ID = Intrinsic::loongarch_csrrd_w;
20484     break;
20485   case LoongArch::BI__builtin_loongarch_csrwr_w:
20486     ID = Intrinsic::loongarch_csrwr_w;
20487     break;
20488   case LoongArch::BI__builtin_loongarch_csrxchg_w:
20489     ID = Intrinsic::loongarch_csrxchg_w;
20490     break;
20491   case LoongArch::BI__builtin_loongarch_csrrd_d:
20492     ID = Intrinsic::loongarch_csrrd_d;
20493     break;
20494   case LoongArch::BI__builtin_loongarch_csrwr_d:
20495     ID = Intrinsic::loongarch_csrwr_d;
20496     break;
20497   case LoongArch::BI__builtin_loongarch_csrxchg_d:
20498     ID = Intrinsic::loongarch_csrxchg_d;
20499     break;
20500   case LoongArch::BI__builtin_loongarch_iocsrrd_b:
20501     ID = Intrinsic::loongarch_iocsrrd_b;
20502     break;
20503   case LoongArch::BI__builtin_loongarch_iocsrrd_h:
20504     ID = Intrinsic::loongarch_iocsrrd_h;
20505     break;
20506   case LoongArch::BI__builtin_loongarch_iocsrrd_w:
20507     ID = Intrinsic::loongarch_iocsrrd_w;
20508     break;
20509   case LoongArch::BI__builtin_loongarch_iocsrrd_d:
20510     ID = Intrinsic::loongarch_iocsrrd_d;
20511     break;
20512   case LoongArch::BI__builtin_loongarch_iocsrwr_b:
20513     ID = Intrinsic::loongarch_iocsrwr_b;
20514     break;
20515   case LoongArch::BI__builtin_loongarch_iocsrwr_h:
20516     ID = Intrinsic::loongarch_iocsrwr_h;
20517     break;
20518   case LoongArch::BI__builtin_loongarch_iocsrwr_w:
20519     ID = Intrinsic::loongarch_iocsrwr_w;
20520     break;
20521   case LoongArch::BI__builtin_loongarch_iocsrwr_d:
20522     ID = Intrinsic::loongarch_iocsrwr_d;
20523     break;
20524   case LoongArch::BI__builtin_loongarch_cpucfg:
20525     ID = Intrinsic::loongarch_cpucfg;
20526     break;
20527   case LoongArch::BI__builtin_loongarch_asrtle_d:
20528     ID = Intrinsic::loongarch_asrtle_d;
20529     break;
20530   case LoongArch::BI__builtin_loongarch_asrtgt_d:
20531     ID = Intrinsic::loongarch_asrtgt_d;
20532     break;
20533   case LoongArch::BI__builtin_loongarch_lddir_d:
20534     ID = Intrinsic::loongarch_lddir_d;
20535     break;
20536   case LoongArch::BI__builtin_loongarch_ldpte_d:
20537     ID = Intrinsic::loongarch_ldpte_d;
20538     break;
    // TODO: Support more intrinsics.
20540   }
20541 
20542   assert(ID != Intrinsic::not_intrinsic);
20543 
20544   llvm::Function *F = CGM.getIntrinsic(ID);
20545   return Builder.CreateCall(F, Ops);
20546 }
20547