1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the interfaces that VE uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "VEISelLowering.h"
15 #include "MCTargetDesc/VEMCExpr.h"
16 #include "VEInstrBuilder.h"
17 #include "VEMachineFunctionInfo.h"
18 #include "VERegisterInfo.h"
19 #include "VETargetMachine.h"
20 #include "llvm/ADT/StringSwitch.h"
21 #include "llvm/CodeGen/CallingConvLower.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/MachineJumpTableInfo.h"
26 #include "llvm/CodeGen/MachineModuleInfo.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/SelectionDAG.h"
29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
30 #include "llvm/IR/DerivedTypes.h"
31 #include "llvm/IR/Function.h"
32 #include "llvm/IR/Module.h"
33 #include "llvm/Support/ErrorHandling.h"
34 #include "llvm/Support/KnownBits.h"
35 using namespace llvm;
36 
37 #define DEBUG_TYPE "ve-lower"
38 
39 //===----------------------------------------------------------------------===//
40 // Calling Convention Implementation
41 //===----------------------------------------------------------------------===//
42 
43 #include "VEGenCallingConv.inc"
44 
45 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
46   switch (CallConv) {
47   default:
48     return RetCC_VE_C;
49   case CallingConv::Fast:
50     return RetCC_VE_Fast;
51   }
52 }
53 
54 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
55   if (IsVarArg)
56     return CC_VE2;
57   switch (CallConv) {
58   default:
59     return CC_VE_C;
60   case CallingConv::Fast:
61     return CC_VE_Fast;
62   }
63 }
64 
65 bool VETargetLowering::CanLowerReturn(
66     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
67     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
68   CCAssignFn *RetCC = getReturnCC(CallConv);
69   SmallVector<CCValAssign, 16> RVLocs;
70   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
71   return CCInfo.CheckReturn(Outs, RetCC);
72 }
73 
// All vector value types that get a register class on the VE vector unit
// (see initRegisterClasses / initVPUActions).
static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
                                   MVT::v256f32, MVT::v512f32, MVT::v256f64};

// Subset of AllVectorVTs that receives custom element insert/extract
// lowering in initVPUActions.  Presumably the 512-element types use the
// VE "packed" lane layout — TODO confirm against the VVP lowering code.
static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
78 
/// Register all legal value types with their register classes.  Vector and
/// mask types are only registered when the subtarget's vector unit (VPU) is
/// enabled.
void VETargetLowering::initRegisterClasses() {
  // Set up the register classes.
  addRegisterClass(MVT::i32, &VE::I32RegClass);
  addRegisterClass(MVT::i64, &VE::I64RegClass);
  addRegisterClass(MVT::f32, &VE::F32RegClass);
  // Note: f64 deliberately shares the 64-bit scalar register class with i64
  // (there is no separate F64 class) — do not "fix" this to F32/F128 classes.
  addRegisterClass(MVT::f64, &VE::I64RegClass);
  addRegisterClass(MVT::f128, &VE::F128RegClass);

  if (Subtarget->enableVPU()) {
    for (MVT VecVT : AllVectorVTs)
      addRegisterClass(VecVT, &VE::V64RegClass);
    // Vector mask registers: v256i1 in VM, v512i1 in the paired VM512 class.
    addRegisterClass(MVT::v256i1, &VE::VMRegClass);
    addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
  }
}
94 
/// Configure legalization actions for all scalar (SPU) operations.  The
/// calls are grouped by the brace-style section comments below (Load & Store,
/// VAARG, Stack, Branch, Int Ops, Conversion, FP Ops, FP math, Atomics,
/// SJLJ, intrinsics).
void VETargetLowering::initSPUActions() {
  const auto &TM = getTargetMachine();
  /// Load & Store {

  // VE doesn't have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // VE doesn't have floating point extload/truncstore, so expand them.
  for (MVT FPVT : MVT::fp_valuetypes()) {
    for (MVT OtherFPVT : MVT::fp_valuetypes()) {
      setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
      setTruncStoreAction(FPVT, OtherFPVT, Expand);
    }
  }

  // VE doesn't have fp128 load/store, so expand them in custom lower.
  setOperationAction(ISD::LOAD, MVT::f128, Custom);
  setOperationAction(ISD::STORE, MVT::f128, Custom);

  /// } Load & Store

  // Custom legalize address nodes into LO/HI parts.
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
  setOperationAction(ISD::BlockAddress, PtrVT, Custom);
  setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
  setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
  setOperationAction(ISD::ConstantPool, PtrVT, Custom);
  setOperationAction(ISD::JumpTable, PtrVT, Custom);

  /// VAARG handling {
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  // VAARG needs to be lowered to access with 8 bytes alignment.
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  // Use the default implementation.
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  /// } VAARG handling

  /// Stack {
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Use the default implementation.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  /// } Stack

  /// Branch {

  // VE doesn't have BRCOND
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  // BR_JT is not implemented yet.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  /// } Branch

  /// Int Ops {
  for (MVT IntVT : {MVT::i32, MVT::i64}) {
    // VE has no REM or DIVREM operations.
    setOperationAction(ISD::UREM, IntVT, Expand);
    setOperationAction(ISD::SREM, IntVT, Expand);
    setOperationAction(ISD::SDIVREM, IntVT, Expand);
    setOperationAction(ISD::UDIVREM, IntVT, Expand);

    // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
    setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
    setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
    setOperationAction(ISD::SRL_PARTS, IntVT, Expand);

    // VE has no MULHU/S or U/SMUL_LOHI operations.
    // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
    setOperationAction(ISD::MULHU, IntVT, Expand);
    setOperationAction(ISD::MULHS, IntVT, Expand);
    setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
    setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);

    // VE has no CTTZ, ROTL, ROTR operations.
    setOperationAction(ISD::CTTZ, IntVT, Expand);
    setOperationAction(ISD::ROTL, IntVT, Expand);
    setOperationAction(ISD::ROTR, IntVT, Expand);

    // VE has 64 bits instruction which works as i64 BSWAP operation.  This
    // instruction works fine as i32 BSWAP operation with an additional
    // parameter.  Use isel patterns to lower BSWAP.
    setOperationAction(ISD::BSWAP, IntVT, Legal);

    // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
    // operations.  Use isel patterns for i64, promote for i32.
    LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
    setOperationAction(ISD::BITREVERSE, IntVT, Act);
    setOperationAction(ISD::CTLZ, IntVT, Act);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
    setOperationAction(ISD::CTPOP, IntVT, Act);

    // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
    // Use isel patterns for i64, promote for i32.
    setOperationAction(ISD::AND, IntVT, Act);
    setOperationAction(ISD::OR, IntVT, Act);
    setOperationAction(ISD::XOR, IntVT, Act);
  }
  /// } Int Ops

  /// Conversion {
  // VE doesn't have instructions for fp<->uint, so expand them by llvm
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);

  // fp16 not supported
  for (MVT FPVT : MVT::fp_valuetypes()) {
    setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
    setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
  }
  /// } Conversion

  /// Floating-point Ops {
  /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
  ///       and fcmp.

  // VE doesn't have following floating point operations.
  for (MVT VT : MVT::fp_valuetypes()) {
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
  }

  // VE doesn't have fdiv of f128.
  setOperationAction(ISD::FDIV, MVT::f128, Expand);

  for (MVT FPVT : {MVT::f32, MVT::f64}) {
    // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
    setOperationAction(ISD::ConstantFP, FPVT, Legal);
  }
  /// } Floating-point Ops

  /// Floating-point math functions {

  // VE doesn't have following floating point math functions.
  for (MVT VT : MVT::fp_valuetypes()) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
  }

  /// } Floating-point math functions

  /// Atomic instructions {

  // Atomics wider than 64 bits go through __atomic_* libcalls; cmpxchg
  // narrower than 32 bits is widened by the generic legalizer.
  setMaxAtomicSizeInBitsSupported(64);
  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  // Use custom inserter for ATOMIC_FENCE.
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Other atomic instructions.
  for (MVT VT : MVT::integer_valuetypes()) {
    // Support i8/i16 atomic swap.
    setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);

    // FIXME: Support "atmam" instructions.
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);

    // VE doesn't have following instructions.
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
  }

  /// } Atomic instructions

  /// SJLJ instructions {
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  /// } SJLJ instructions

  // Intrinsic instructions
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
}
293 
/// Configure legalization actions for vector value types.  Every VP / VVP
/// opcode listed in VVPNodes.def is custom-lowered for each legal vector VT.
void VETargetLowering::initVPUActions() {
  for (MVT LegalVecVT : AllVectorVTs) {
    setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
    // Translate all vector instructions with legal element types to VVP_*
    // nodes.
    // TODO We will custom-widen into VVP_* nodes in the future. While we are
    // building the infrastructure for this, we only do this for legal vector
    // VTs.
#define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
  setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
#define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
  setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
#include "VVPNodes.def"
  }

  // Packed VTs override the Legal insert/extract set above with custom
  // lowering.
  for (MVT LegalPackedVT : AllPackedVTs) {
    setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
  }
}
316 
/// Lower outgoing return values.  Each value is copied into its assigned
/// physical register (the return CC only ever assigns registers — see the
/// isRegLoc assert), with the copies glued together so they stay adjacent,
/// and the chain terminates in a VEISD::RET_FLAG node.
SDValue
VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool IsVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to locations.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slot.
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze return values.
  CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    assert(!VA.needsCustom() && "Unexpected custom lowering");
    SDValue OutVal = OutVals[i];

    // Integer return values must be sign or zero extended by the callee.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
      break;
    case CCValAssign::ZExt:
      OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
      break;
    case CCValAssign::AExt:
      OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
      break;
    case CCValAssign::BCvt: {
      // Convert a float return value to i64 with padding.
      //     63     31   0
      //    +------+------+
      //    | float|   0  |
      //    +------+------+
      // Implemented as INSERT_SUBREG of the f32 into the sub_f32 lane of an
      // undefined i64.
      assert(VA.getLocVT() == MVT::i64);
      assert(VA.getValVT() == MVT::f32);
      SDValue Undef = SDValue(
          DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
      SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
      OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                          MVT::i64, Undef, OutVal, Sub_f32),
                       0);
      break;
    }
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);

    // Guarantee that all emitted copies are stuck together with flags.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
}
391 
/// Lower incoming formal arguments.  Register arguments become CopyFromReg
/// nodes (with Assert?ext / EXTRACT_SUBREG fixups for promoted or
/// bit-converted values); arguments that overflowed the registers become
/// loads from fixed stack objects offset past the register save area.  For
/// variadic functions the va_start offset is recorded in
/// VEMachineFunctionInfo.
SDValue VETargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();

  // Get the base offset of the incoming arguments stack space.
  unsigned ArgsBaseOffset = Subtarget->getRsaSize();
  // Get the size of the preserved arguments area
  unsigned ArgsPreserved = 64;

  // Analyze arguments according to CC_VE.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  // Allocate the preserved area first.
  CCInfo.AllocateStack(ArgsPreserved, Align(8));
  // We already allocated the preserved area, so the stack offset computed
  // by CC_VE would be correct now.
  // NOTE(review): IsVarArg is deliberately not forwarded here — named
  // arguments of variadic functions still use the non-vararg CC; the vararg
  // area is handled via setVarArgsFrameOffset below.  Confirm when touching.
  CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    assert(!VA.needsCustom() && "Unexpected custom lowering");
    if (VA.isRegLoc()) {
      // This argument is passed in a register.
      // All integer register arguments are promoted by the caller to i64.

      // Create a virtual register for the promoted live-in value.
      unsigned VReg =
          MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
      SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());

      // The caller promoted the argument, so insert an Assert?ext SDNode so we
      // won't promote the value again in this function.
      switch (VA.getLocInfo()) {
      case CCValAssign::SExt:
        Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
                          DAG.getValueType(VA.getValVT()));
        break;
      case CCValAssign::ZExt:
        Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
                          DAG.getValueType(VA.getValVT()));
        break;
      case CCValAssign::BCvt: {
        // Extract a float argument from i64 with padding.
        //     63     31   0
        //    +------+------+
        //    | float|   0  |
        //    +------+------+
        assert(VA.getLocVT() == MVT::i64);
        assert(VA.getValVT() == MVT::f32);
        SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
        Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                         MVT::f32, Arg, Sub_f32),
                      0);
        break;
      }
      default:
        break;
      }

      // Truncate the register down to the argument type.
      if (VA.isExtInLoc())
        Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

      InVals.push_back(Arg);
      continue;
    }

    // The registers are exhausted. This argument was passed on the stack.
    assert(VA.isMemLoc());
    // The CC_VE_Full/Half functions compute stack offsets relative to the
    // beginning of the arguments area at %fp + the size of reserved area.
    unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
    unsigned ValSize = VA.getValVT().getSizeInBits() / 8;

    // Adjust offset for a float argument by adding 4 since the argument is
    // stored in 8 bytes buffer with offset like below.  LLVM generates
    // 4 bytes load instruction, so need to adjust offset here.  This
    // adjustment is required in only LowerFormalArguments.  In LowerCall,
    // a float argument is converted to i64 first, and stored as 8 bytes
    // data, which is required by ABI, so no need for adjustment.
    //    0      4
    //    +------+------+
    //    | empty| float|
    //    +------+------+
    if (VA.getValVT() == MVT::f32)
      Offset += 4;

    int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
    InVals.push_back(
        DAG.getLoad(VA.getValVT(), DL, Chain,
                    DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
                    MachinePointerInfo::getFixedStack(MF, FI)));
  }

  if (!IsVarArg)
    return Chain;

  // This function takes variable arguments, some of which may have been passed
  // in registers %s0-%s8.
  //
  // The va_start intrinsic needs to know the offset to the first variable
  // argument.
  // TODO: need to calculate offset correctly once we support f128.
  unsigned ArgOffset = ArgLocs.size() * 8;
  VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
  // Skip the reserved area at the top of stack.
  FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);

  return Chain;
}
505 
506 // FIXME? Maybe this could be a TableGen attribute on some registers and
507 // this table could be generated automatically from RegInfo.
508 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
509                                              const MachineFunction &MF) const {
510   Register Reg = StringSwitch<Register>(RegName)
511                      .Case("sp", VE::SX11)    // Stack pointer
512                      .Case("fp", VE::SX9)     // Frame pointer
513                      .Case("sl", VE::SX8)     // Stack limit
514                      .Case("lr", VE::SX10)    // Link register
515                      .Case("tp", VE::SX14)    // Thread pointer
516                      .Case("outer", VE::SX12) // Outer regiser
517                      .Case("info", VE::SX17)  // Info area register
518                      .Case("got", VE::SX15)   // Global offset table register
519                      .Case("plt", VE::SX16) // Procedure linkage table register
520                      .Default(0);
521 
522   if (Reg)
523     return Reg;
524 
525   report_fatal_error("Invalid register name global variable");
526 }
527 
528 //===----------------------------------------------------------------------===//
529 // TargetLowering Implementation
530 //===----------------------------------------------------------------------===//
531 
532 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
533                                     SmallVectorImpl<SDValue> &InVals) const {
534   SelectionDAG &DAG = CLI.DAG;
535   SDLoc DL = CLI.DL;
536   SDValue Chain = CLI.Chain;
537   auto PtrVT = getPointerTy(DAG.getDataLayout());
538 
539   // VE target does not yet support tail call optimization.
540   CLI.IsTailCall = false;
541 
542   // Get the base offset of the outgoing arguments stack space.
543   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
544   // Get the size of the preserved arguments area
545   unsigned ArgsPreserved = 8 * 8u;
546 
547   // Analyze operands of the call, assigning locations to each operand.
548   SmallVector<CCValAssign, 16> ArgLocs;
549   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
550                  *DAG.getContext());
551   // Allocate the preserved area first.
552   CCInfo.AllocateStack(ArgsPreserved, Align(8));
553   // We already allocated the preserved area, so the stack offset computed
554   // by CC_VE would be correct now.
555   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
556 
557   // VE requires to use both register and stack for varargs or no-prototyped
558   // functions.
559   bool UseBoth = CLI.IsVarArg;
560 
561   // Analyze operands again if it is required to store BOTH.
562   SmallVector<CCValAssign, 16> ArgLocs2;
563   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
564                   ArgLocs2, *DAG.getContext());
565   if (UseBoth)
566     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
567 
568   // Get the size of the outgoing arguments stack space requirement.
569   unsigned ArgsSize = CCInfo.getNextStackOffset();
570 
571   // Keep stack frames 16-byte aligned.
572   ArgsSize = alignTo(ArgsSize, 16);
573 
574   // Adjust the stack pointer to make room for the arguments.
575   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
576   // with more than 6 arguments.
577   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
578 
579   // Collect the set of registers to pass to the function and their values.
580   // This will be emitted as a sequence of CopyToReg nodes glued to the call
581   // instruction.
582   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
583 
584   // Collect chains from all the memory opeations that copy arguments to the
585   // stack. They must follow the stack pointer adjustment above and precede the
586   // call instruction itself.
587   SmallVector<SDValue, 8> MemOpChains;
588 
589   // VE needs to get address of callee function in a register
590   // So, prepare to copy it to SX12 here.
591 
592   // If the callee is a GlobalAddress node (quite common, every direct call is)
593   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
594   // Likewise ExternalSymbol -> TargetExternalSymbol.
595   SDValue Callee = CLI.Callee;
596 
597   bool IsPICCall = isPositionIndependent();
598 
599   // PC-relative references to external symbols should go through $stub.
600   // If so, we need to prepare GlobalBaseReg first.
601   const TargetMachine &TM = DAG.getTarget();
602   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
603   const GlobalValue *GV = nullptr;
604   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
605   if (CalleeG)
606     GV = CalleeG->getGlobal();
607   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
608   bool UsePlt = !Local;
609   MachineFunction &MF = DAG.getMachineFunction();
610 
611   // Turn GlobalAddress/ExternalSymbol node into a value node
612   // containing the address of them here.
613   if (CalleeG) {
614     if (IsPICCall) {
615       if (UsePlt)
616         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
617       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
618       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
619     } else {
620       Callee =
621           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
622     }
623   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
624     if (IsPICCall) {
625       if (UsePlt)
626         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
627       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
628       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
629     } else {
630       Callee =
631           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
632     }
633   }
634 
635   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
636 
637   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
638     CCValAssign &VA = ArgLocs[i];
639     SDValue Arg = CLI.OutVals[i];
640 
641     // Promote the value if needed.
642     switch (VA.getLocInfo()) {
643     default:
644       llvm_unreachable("Unknown location info!");
645     case CCValAssign::Full:
646       break;
647     case CCValAssign::SExt:
648       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
649       break;
650     case CCValAssign::ZExt:
651       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
652       break;
653     case CCValAssign::AExt:
654       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
655       break;
656     case CCValAssign::BCvt: {
657       // Convert a float argument to i64 with padding.
658       //     63     31   0
659       //    +------+------+
660       //    | float|   0  |
661       //    +------+------+
662       assert(VA.getLocVT() == MVT::i64);
663       assert(VA.getValVT() == MVT::f32);
664       SDValue Undef = SDValue(
665           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
666       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
667       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
668                                        MVT::i64, Undef, Arg, Sub_f32),
669                     0);
670       break;
671     }
672     }
673 
674     if (VA.isRegLoc()) {
675       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
676       if (!UseBoth)
677         continue;
678       VA = ArgLocs2[i];
679     }
680 
681     assert(VA.isMemLoc());
682 
683     // Create a store off the stack pointer for this argument.
684     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
685     // The argument area starts at %fp/%sp + the size of reserved area.
686     SDValue PtrOff =
687         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
688     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
689     MemOpChains.push_back(
690         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
691   }
692 
693   // Emit all stores, make sure they occur before the call.
694   if (!MemOpChains.empty())
695     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
696 
697   // Build a sequence of CopyToReg nodes glued together with token chain and
698   // glue operands which copy the outgoing args into registers. The InGlue is
699   // necessary since all emitted instructions must be stuck together in order
700   // to pass the live physical registers.
701   SDValue InGlue;
702   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
703     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
704                              RegsToPass[i].second, InGlue);
705     InGlue = Chain.getValue(1);
706   }
707 
708   // Build the operands for the call instruction itself.
709   SmallVector<SDValue, 8> Ops;
710   Ops.push_back(Chain);
711   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
712     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
713                                   RegsToPass[i].second.getValueType()));
714 
715   // Add a register mask operand representing the call-preserved registers.
716   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
717   const uint32_t *Mask =
718       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
719   assert(Mask && "Missing call preserved mask for calling convention");
720   Ops.push_back(DAG.getRegisterMask(Mask));
721 
722   // Make sure the CopyToReg nodes are glued to the call instruction which
723   // consumes the registers.
724   if (InGlue.getNode())
725     Ops.push_back(InGlue);
726 
727   // Now the call itself.
728   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
729   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
730   InGlue = Chain.getValue(1);
731 
732   // Revert the stack pointer immediately after the call.
733   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
734                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
735   InGlue = Chain.getValue(1);
736 
737   // Now extract the return values. This is more or less the same as
738   // LowerFormalArguments.
739 
740   // Assign locations to each value returned by this call.
741   SmallVector<CCValAssign, 16> RVLocs;
742   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
743                  *DAG.getContext());
744 
745   // Set inreg flag manually for codegen generated library calls that
746   // return float.
747   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
748     CLI.Ins[0].Flags.setInReg();
749 
750   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
751 
752   // Copy all of the result registers out of their specified physreg.
753   for (unsigned i = 0; i != RVLocs.size(); ++i) {
754     CCValAssign &VA = RVLocs[i];
755     assert(!VA.needsCustom() && "Unexpected custom lowering");
756     unsigned Reg = VA.getLocReg();
757 
758     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
759     // reside in the same register in the high and low bits. Reuse the
760     // CopyFromReg previous node to avoid duplicate copies.
761     SDValue RV;
762     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
763       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
764         RV = Chain.getValue(0);
765 
766     // But usually we'll create a new CopyFromReg for a different register.
767     if (!RV.getNode()) {
768       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
769       Chain = RV.getValue(1);
770       InGlue = Chain.getValue(2);
771     }
772 
773     // The callee promoted the return value, so insert an Assert?ext SDNode so
774     // we won't promote the value again in this function.
775     switch (VA.getLocInfo()) {
776     case CCValAssign::SExt:
777       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
778                        DAG.getValueType(VA.getValVT()));
779       break;
780     case CCValAssign::ZExt:
781       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
782                        DAG.getValueType(VA.getValVT()));
783       break;
784     case CCValAssign::BCvt: {
785       // Extract a float return value from i64 with padding.
786       //     63     31   0
787       //    +------+------+
788       //    | float|   0  |
789       //    +------+------+
790       assert(VA.getLocVT() == MVT::i64);
791       assert(VA.getValVT() == MVT::f32);
792       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
793       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
794                                       MVT::f32, RV, Sub_f32),
795                    0);
796       break;
797     }
798     default:
799       break;
800     }
801 
802     // Truncate the register down to the return value type.
803     if (VA.isExtInLoc())
804       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
805 
806     InVals.push_back(RV);
807   }
808 
809   return Chain;
810 }
811 
812 bool VETargetLowering::isOffsetFoldingLegal(
813     const GlobalAddressSDNode *GA) const {
814   // VE uses 64 bit addressing, so we need multiple instructions to generate
815   // an address.  Folding address with offset increases the number of
816   // instructions, so that we disable it here.  Offsets will be folded in
817   // the DAG combine later if it worth to do so.
818   return false;
819 }
820 
821 /// isFPImmLegal - Returns true if the target can instruction select the
822 /// specified FP immediate natively. If false, the legalizer will
823 /// materialize the FP immediate as a load from a constant pool.
824 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
825                                     bool ForCodeSize) const {
826   return VT == MVT::f32 || VT == MVT::f64;
827 }
828 
829 /// Determine if the target supports unaligned memory accesses.
830 ///
831 /// This function returns true if the target allows unaligned memory accesses
832 /// of the specified type in the given address space. If true, it also returns
833 /// whether the unaligned memory access is "fast" in the last argument by
834 /// reference. This is used, for example, in situations where an array
835 /// copy/move/set is converted to a sequence of store operations. Its use
836 /// helps to ensure that such replacements don't generate code that causes an
837 /// alignment error (trap) on the target machine.
838 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
839                                                       unsigned AddrSpace,
840                                                       unsigned Align,
841                                                       MachineMemOperand::Flags,
842                                                       bool *Fast) const {
843   if (Fast) {
844     // It's fast anytime on VE
845     *Fast = true;
846   }
847   return true;
848 }
849 
VETargetLowering::VETargetLowering(const TargetMachine &TM,
                                   const VESubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Instructions which use registers as conditionals examine all the
  // bits (as does the pseudo SELECT_CC expansion). I don't think it
  // matters much whether it's ZeroOrOneBooleanContent, or
  // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
  // former.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // Register classes plus scalar (SPU) and vector (VPU) legalization
  // actions are configured by these helpers.
  initRegisterClasses();
  initSPUActions();
  initVPUActions();

  // %sx11 is VE's stack pointer.
  setStackPointerRegisterToSaveRestore(VE::SX11);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::TRUNCATE);

  // Set function alignment to 16 bytes
  setMinFunctionAlignment(Align(16));

  // VE stores all argument by 8 bytes alignment
  setMinStackArgumentAlignment(Align(8));

  // Must run after all legalization actions are registered above.
  computeRegisterProperties(Subtarget->getRegisterInfo());
}
878 
// Map a VEISD opcode to its printable name for DAG dumps/debug output.
// Returns nullptr for opcodes that are not VE target nodes.
const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
#define TARGET_NODE_CASE(NAME)                                                 \
  case VEISD::NAME:                                                            \
    return "VEISD::" #NAME;
  switch ((VEISD::NodeType)Opcode) {
  case VEISD::FIRST_NUMBER:
    break;
    TARGET_NODE_CASE(CALL)
    TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
    TARGET_NODE_CASE(EH_SJLJ_SETJMP)
    TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
    TARGET_NODE_CASE(GETFUNPLT)
    TARGET_NODE_CASE(GETSTACKTOP)
    TARGET_NODE_CASE(GETTLSADDR)
    TARGET_NODE_CASE(GLOBAL_BASE_REG)
    TARGET_NODE_CASE(Hi)
    TARGET_NODE_CASE(Lo)
    TARGET_NODE_CASE(MEMBARRIER)
    TARGET_NODE_CASE(RET_FLAG)
    TARGET_NODE_CASE(TS1AM)
    TARGET_NODE_CASE(VEC_BROADCAST)

    // Register the VVP_* SDNodes.
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
#include "VVPNodes.def"
  }
#undef TARGET_NODE_CASE
  return nullptr;
}
908 
// VE produces all setcc results as i32 regardless of the compared type.
EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                         EVT VT) const {
  return MVT::i32;
}
913 
914 // Convert to a target node and set target flags.
915 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
916                                           SelectionDAG &DAG) const {
917   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
918     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
919                                       GA->getValueType(0), GA->getOffset(), TF);
920 
921   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
922     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
923                                      0, TF);
924 
925   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
926     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
927                                      CP->getAlign(), CP->getOffset(), TF);
928 
929   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
930     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
931                                        TF);
932 
933   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
934     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
935 
936   llvm_unreachable("Unhandled address SDNode");
937 }
938 
939 // Split Op into high and low parts according to HiTF and LoTF.
940 // Return an ADD node combining the parts.
941 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
942                                        SelectionDAG &DAG) const {
943   SDLoc DL(Op);
944   EVT VT = Op.getValueType();
945   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
946   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
947   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
948 }
949 
950 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
951 // or ExternalSymbol SDNode.
952 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
953   SDLoc DL(Op);
954   EVT PtrVT = Op.getValueType();
955 
956   // Handle PIC mode first. VE needs a got load for every variable!
957   if (isPositionIndependent()) {
958     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
959 
960     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
961         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
962       // Create following instructions for local linkage PIC code.
963       //     lea %reg, label@gotoff_lo
964       //     and %reg, %reg, (32)0
965       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
966       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
967                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
968       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
969       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
970     }
971     // Create following instructions for not local linkage PIC code.
972     //     lea %reg, label@got_lo
973     //     and %reg, %reg, (32)0
974     //     lea.sl %reg, label@got_hi(%reg)
975     //     ld %reg, (%reg, %got)
976     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
977                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
978     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
979     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
980     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
981                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
982   }
983 
984   // This is one of the absolute code models.
985   switch (getTargetMachine().getCodeModel()) {
986   default:
987     llvm_unreachable("Unsupported absolute code model");
988   case CodeModel::Small:
989   case CodeModel::Medium:
990   case CodeModel::Large:
991     // abs64.
992     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
993   }
994 }
995 
996 /// Custom Lower {
997 
998 // The mappings for emitLeading/TrailingFence for VE is designed by following
999 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
1000 Instruction *VETargetLowering::emitLeadingFence(IRBuilder<> &Builder,
1001                                                 Instruction *Inst,
1002                                                 AtomicOrdering Ord) const {
1003   switch (Ord) {
1004   case AtomicOrdering::NotAtomic:
1005   case AtomicOrdering::Unordered:
1006     llvm_unreachable("Invalid fence: unordered/non-atomic");
1007   case AtomicOrdering::Monotonic:
1008   case AtomicOrdering::Acquire:
1009     return nullptr; // Nothing to do
1010   case AtomicOrdering::Release:
1011   case AtomicOrdering::AcquireRelease:
1012     return Builder.CreateFence(AtomicOrdering::Release);
1013   case AtomicOrdering::SequentiallyConsistent:
1014     if (!Inst->hasAtomicStore())
1015       return nullptr; // Nothing to do
1016     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1017   }
1018   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
1019 }
1020 
// Emit the IR fence required *after* an atomic instruction for the given
// ordering (counterpart of emitLeadingFence above).
Instruction *VETargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
    // An acquire fence after the operation keeps later accesses behind it.
    return Builder.CreateFence(AtomicOrdering::Acquire);
  case AtomicOrdering::SequentiallyConsistent:
    return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}
1039 
1040 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
1041                                             SelectionDAG &DAG) const {
1042   SDLoc DL(Op);
1043   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
1044       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
1045   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
1046       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
1047 
1048   // VE uses Release consistency, so need a fence instruction if it is a
1049   // cross-thread fence.
1050   if (FenceSSID == SyncScope::System) {
1051     switch (FenceOrdering) {
1052     case AtomicOrdering::NotAtomic:
1053     case AtomicOrdering::Unordered:
1054     case AtomicOrdering::Monotonic:
1055       // No need to generate fencem instruction here.
1056       break;
1057     case AtomicOrdering::Acquire:
1058       // Generate "fencem 2" as acquire fence.
1059       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1060                                         DAG.getTargetConstant(2, DL, MVT::i32),
1061                                         Op.getOperand(0)),
1062                      0);
1063     case AtomicOrdering::Release:
1064       // Generate "fencem 1" as release fence.
1065       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1066                                         DAG.getTargetConstant(1, DL, MVT::i32),
1067                                         Op.getOperand(0)),
1068                      0);
1069     case AtomicOrdering::AcquireRelease:
1070     case AtomicOrdering::SequentiallyConsistent:
1071       // Generate "fencem 3" as acq_rel and seq_cst fence.
1072       // FIXME: "fencem 3" doesn't wait for for PCIe deveices accesses,
1073       //        so  seq_cst may require more instruction for them.
1074       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1075                                         DAG.getTargetConstant(3, DL, MVT::i32),
1076                                         Op.getOperand(0)),
1077                      0);
1078     }
1079   }
1080 
1081   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
1082   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
1083 }
1084 
1085 TargetLowering::AtomicExpansionKind
1086 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
1087   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
1088   if (AI->getOperation() == AtomicRMWInst::Xchg) {
1089     return AtomicExpansionKind::None;
1090   }
1091   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
1092 
1093   // Otherwise, expand it using compare and exchange instruction to not call
1094   // __sync_fetch_and_* functions.
1095   return AtomicExpansionKind::CmpXChg;
1096 }
1097 
1098 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
1099                             SDValue &Bits) {
1100   SDLoc DL(Op);
1101   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1102   SDValue Ptr = N->getOperand(1);
1103   SDValue Val = N->getOperand(2);
1104   EVT PtrVT = Ptr.getValueType();
1105   bool Byte = N->getMemoryVT() == MVT::i8;
1106   //   Remainder = AND Ptr, 3
1107   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
1108   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
1109   //   Bits = Remainder << 3
1110   //   NewVal = Val << Bits
1111   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
1112   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
1113   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
1114                       : DAG.getConstant(3, DL, MVT::i32);
1115   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
1116   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
1117   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
1118 }
1119 
1120 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
1121                              SDValue Bits) {
1122   SDLoc DL(Op);
1123   EVT VT = Data.getValueType();
1124   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
1125   //   NewData = Data >> Bits
1126   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
1127   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
1128 
1129   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
1130   return DAG.getNode(ISD::AND, DL, VT,
1131                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
1132 }
1133 
1134 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
1135                                            SelectionDAG &DAG) const {
1136   SDLoc DL(Op);
1137   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1138 
1139   if (N->getMemoryVT() == MVT::i8) {
1140     // For i8, use "ts1am"
1141     //   Input:
1142     //     ATOMIC_SWAP Ptr, Val, Order
1143     //
1144     //   Output:
1145     //     Remainder = AND Ptr, 3
1146     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
1147     //     Bits = Remainder << 3
1148     //     NewVal = Val << Bits
1149     //
1150     //     Aligned = AND Ptr, -4
1151     //     Data = TS1AM Aligned, Flag, NewVal
1152     //
1153     //     NewData = Data >> Bits
1154     //     Result = NewData & 0xff ; 1 byte result
1155     SDValue Flag;
1156     SDValue Bits;
1157     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1158 
1159     SDValue Ptr = N->getOperand(1);
1160     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1161                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1162     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1163                                   DAG.getVTList(Op.getNode()->getValueType(0),
1164                                                 Op.getNode()->getValueType(1)),
1165                                   {N->getChain(), Aligned, Flag, NewVal},
1166                                   N->getMemOperand());
1167 
1168     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1169     SDValue Chain = TS1AM.getValue(1);
1170     return DAG.getMergeValues({Result, Chain}, DL);
1171   }
1172   if (N->getMemoryVT() == MVT::i16) {
1173     // For i16, use "ts1am"
1174     SDValue Flag;
1175     SDValue Bits;
1176     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1177 
1178     SDValue Ptr = N->getOperand(1);
1179     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1180                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1181     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1182                                   DAG.getVTList(Op.getNode()->getValueType(0),
1183                                                 Op.getNode()->getValueType(1)),
1184                                   {N->getChain(), Aligned, Flag, NewVal},
1185                                   N->getMemOperand());
1186 
1187     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1188     SDValue Chain = TS1AM.getValue(1);
1189     return DAG.getMergeValues({Result, Chain}, DL);
1190   }
1191   // Otherwise, let llvm legalize it.
1192   return Op;
1193 }
1194 
// Global addresses go through the common address-materialization path.
SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  return makeAddress(Op, DAG);
}
1199 
// Block addresses go through the common address-materialization path.
SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
                                            SelectionDAG &DAG) const {
  return makeAddress(Op, DAG);
}
1204 
// Constant-pool entries go through the common address-materialization path.
SDValue VETargetLowering::lowerConstantPool(SDValue Op,
                                            SelectionDAG &DAG) const {
  return makeAddress(Op, DAG);
}
1209 
// Lower a TLS address using the general dynamic model: a GETTLSADDR pseudo
// call wrapped in a call sequence, returning the address from %sx0.
SDValue
VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // Generate the following code:
  //   t1: ch,glue = callseq_start t0, 0, 0
  //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
  //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
  //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
  SDValue Label = withTargetFlags(Op, 0, DAG);
  EVT PtrVT = Op.getValueType();

  // Lowering the machine isd will make sure everything is in the right
  // location.
  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
      DAG.getMachineFunction(), CallingConv::C);
  Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
  SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
  Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
                             DAG.getIntPtrConstant(0, DL, true),
                             Chain.getValue(1), DL);
  // The resolved TLS address comes back in %sx0.
  Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));

  // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setHasCalls(true);

  // Also generate code to prepare a GOT register if it is PIC.
  if (isPositionIndependent()) {
    MachineFunction &MF = DAG.getMachineFunction();
    Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
  }

  return Chain;
}
1249 
SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  // The current implementation of nld (2.26) doesn't allow local exec model
  // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
  // generate the general dynamic model code sequence.
  //
  // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
  return lowerToTLSGeneralDynamicModel(Op, DAG);
}
1259 
// Jump-table addresses go through the common address-materialization path.
SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  return makeAddress(Op, DAG);
}
1263 
1264 // Lower a f128 load into two f64 loads.
1265 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
1266   SDLoc DL(Op);
1267   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1268   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1269   unsigned Alignment = LdNode->getAlign().value();
1270   if (Alignment > 8)
1271     Alignment = 8;
1272 
1273   SDValue Lo64 =
1274       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
1275                   LdNode->getPointerInfo(), Alignment,
1276                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1277                                        : MachineMemOperand::MONone);
1278   EVT AddrVT = LdNode->getBasePtr().getValueType();
1279   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
1280                               DAG.getConstant(8, DL, AddrVT));
1281   SDValue Hi64 =
1282       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
1283                   LdNode->getPointerInfo(), Alignment,
1284                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1285                                        : MachineMemOperand::MONone);
1286 
1287   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1288   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1289 
1290   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1291   SDNode *InFP128 =
1292       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
1293   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1294                                SDValue(InFP128, 0), Hi64, SubRegEven);
1295   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1296                                SDValue(InFP128, 0), Lo64, SubRegOdd);
1297   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
1298                           SDValue(Hi64.getNode(), 1)};
1299   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1300   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
1301   return DAG.getMergeValues(Ops, DL);
1302 }
1303 
1304 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1305   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
1306 
1307   SDValue BasePtr = LdNode->getBasePtr();
1308   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1309     // Do not expand store instruction with frame index here because of
1310     // dependency problems.  We expand it later in eliminateFrameIndex().
1311     return Op;
1312   }
1313 
1314   EVT MemVT = LdNode->getMemoryVT();
1315   if (MemVT == MVT::f128)
1316     return lowerLoadF128(Op, DAG);
1317 
1318   return Op;
1319 }
1320 
1321 // Lower a f128 store into two f64 stores.
1322 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
1323   SDLoc DL(Op);
1324   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1325   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1326 
1327   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1328   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1329 
1330   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1331                                     StNode->getValue(), SubRegEven);
1332   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1333                                     StNode->getValue(), SubRegOdd);
1334 
1335   unsigned Alignment = StNode->getAlign().value();
1336   if (Alignment > 8)
1337     Alignment = 8;
1338 
1339   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1340   SDValue OutChains[2];
1341   OutChains[0] =
1342       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
1343                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
1344                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1345                                         : MachineMemOperand::MONone);
1346   EVT AddrVT = StNode->getBasePtr().getValueType();
1347   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
1348                               DAG.getConstant(8, DL, AddrVT));
1349   OutChains[1] =
1350       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
1351                    MachinePointerInfo(), Alignment,
1352                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1353                                         : MachineMemOperand::MONone);
1354   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1355 }
1356 
1357 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1358   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
1359   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1360 
1361   SDValue BasePtr = StNode->getBasePtr();
1362   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1363     // Do not expand store instruction with frame index here because of
1364     // dependency problems.  We expand it later in eliminateFrameIndex().
1365     return Op;
1366   }
1367 
1368   EVT MemVT = StNode->getMemoryVT();
1369   if (MemVT == MVT::f128)
1370     return lowerStoreF128(Op, DAG);
1371 
1372   // Otherwise, ask llvm to expand it.
1373   return SDValue();
1374 }
1375 
1376 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
1377   MachineFunction &MF = DAG.getMachineFunction();
1378   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
1379   auto PtrVT = getPointerTy(DAG.getDataLayout());
1380 
1381   // Need frame address to find the address of VarArgsFrameIndex.
1382   MF.getFrameInfo().setFrameAddressIsTaken(true);
1383 
1384   // vastart just stores the address of the VarArgsFrameIndex slot into the
1385   // memory location argument.
1386   SDLoc DL(Op);
1387   SDValue Offset =
1388       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
1389                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
1390   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
1391   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
1392                       MachinePointerInfo(SV));
1393 }
1394 
1395 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
1396   SDNode *Node = Op.getNode();
1397   EVT VT = Node->getValueType(0);
1398   SDValue InChain = Node->getOperand(0);
1399   SDValue VAListPtr = Node->getOperand(1);
1400   EVT PtrVT = VAListPtr.getValueType();
1401   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
1402   SDLoc DL(Node);
1403   SDValue VAList =
1404       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
1405   SDValue Chain = VAList.getValue(1);
1406   SDValue NextPtr;
1407 
1408   if (VT == MVT::f128) {
1409     // VE f128 values must be stored with 16 bytes alignment.  We doesn't
1410     // know the actual alignment of VAList, so we take alignment of it
1411     // dyanmically.
1412     int Align = 16;
1413     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1414                          DAG.getConstant(Align - 1, DL, PtrVT));
1415     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
1416                          DAG.getConstant(-Align, DL, PtrVT));
1417     // Increment the pointer, VAList, by 16 to the next vaarg.
1418     NextPtr =
1419         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
1420   } else if (VT == MVT::f32) {
1421     // float --> need special handling like below.
1422     //    0      4
1423     //    +------+------+
1424     //    | empty| float|
1425     //    +------+------+
1426     // Increment the pointer, VAList, by 8 to the next vaarg.
1427     NextPtr =
1428         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1429     // Then, adjust VAList.
1430     unsigned InternalOffset = 4;
1431     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1432                          DAG.getConstant(InternalOffset, DL, PtrVT));
1433   } else {
1434     // Increment the pointer, VAList, by 8 to the next vaarg.
1435     NextPtr =
1436         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1437   }
1438 
1439   // Store the incremented VAList to the legalized pointer.
1440   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
1441 
1442   // Load the actual argument out of the pointer VAList.
1443   // We can't count on greater alignment than the word size.
1444   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
1445                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
1446 }
1447 
SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // Lower a dynamic stack allocation into a call to a runtime helper that
  // grows the stack, followed by a read of the new stack top.
  // Generate following code.
  //   (void)__ve_grow_stack(size);  // or __ve_grow_stack_align(size, -align)
  //   ret = GETSTACKTOP;        // pseudo instruction
  SDLoc DL(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  MaybeAlign Alignment(Op.getConstantOperandVal(2));
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

  // Extra handling is only needed when the requested alignment exceeds the
  // target's natural stack alignment.
  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
  Align StackAlign = TFI.getStackAlign();
  bool NeedsAlign = Alignment.valueOrOne() > StackAlign;

  // Prepare arguments
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Node = Size;
  Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
  Args.push_back(Entry);
  if (NeedsAlign) {
    // Second argument is the alignment mask, ~(align - 1).
    Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Entry);
  }
  Type *RetTy = Type::getVoidTy(*DAG.getContext());

  // Select the helper: the aligning variant when over-alignment is requested.
  EVT PtrVT = Op.getValueType();
  SDValue Callee;
  if (NeedsAlign) {
    Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
  } else {
    Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
  }

  // The helper's result is unused; only its side effect on the stack matters.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL)
      .setChain(Chain)
      .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
      .setDiscardResult(true);
  std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
  Chain = pair.second;
  // Read the new stack top; this is the address of the allocated area.
  SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
  if (NeedsAlign) {
    // Round the result up to the requested alignment:
    //   Result = (Result + align - 1) & ~(align - 1)
    Result = DAG.getNode(ISD::ADD, DL, VT, Result,
                         DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
    Result = DAG.getNode(ISD::AND, DL, VT, Result,
                         DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
  }
  //  Chain = Result.getValue(1);
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                             DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);

  // Return both the allocated address and the updated chain.
  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, DL);
}
1512 
1513 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
1514                                                SelectionDAG &DAG) const {
1515   SDLoc DL(Op);
1516   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
1517                      Op.getOperand(1));
1518 }
1519 
1520 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
1521                                               SelectionDAG &DAG) const {
1522   SDLoc DL(Op);
1523   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
1524                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
1525                      Op.getOperand(1));
1526 }
1527 
1528 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
1529                                                       SelectionDAG &DAG) const {
1530   SDLoc DL(Op);
1531   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
1532                      Op.getOperand(0));
1533 }
1534 
1535 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
1536                               const VETargetLowering &TLI,
1537                               const VESubtarget *Subtarget) {
1538   SDLoc DL(Op);
1539   MachineFunction &MF = DAG.getMachineFunction();
1540   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
1541 
1542   MachineFrameInfo &MFI = MF.getFrameInfo();
1543   MFI.setFrameAddressIsTaken(true);
1544 
1545   unsigned Depth = Op.getConstantOperandVal(0);
1546   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
1547   unsigned FrameReg = RegInfo->getFrameRegister(MF);
1548   SDValue FrameAddr =
1549       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
1550   while (Depth--)
1551     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
1552                             FrameAddr, MachinePointerInfo());
1553   return FrameAddr;
1554 }
1555 
1556 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
1557                                const VETargetLowering &TLI,
1558                                const VESubtarget *Subtarget) {
1559   MachineFunction &MF = DAG.getMachineFunction();
1560   MachineFrameInfo &MFI = MF.getFrameInfo();
1561   MFI.setReturnAddressIsTaken(true);
1562 
1563   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
1564     return SDValue();
1565 
1566   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
1567 
1568   SDLoc DL(Op);
1569   EVT VT = Op.getValueType();
1570   SDValue Offset = DAG.getConstant(8, DL, VT);
1571   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1572                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
1573                      MachinePointerInfo());
1574 }
1575 
SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // Operand 0 holds the intrinsic ID for INTRINSIC_WO_CHAIN nodes.
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: // Don't custom lower most intrinsics.
    return SDValue();
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    MVT VT = Op.getSimpleValueType();
    const VETargetMachine *TM =
        static_cast<const VETargetMachine *>(&DAG.getTarget());

    // Create GCC_except_tableXX string.  The real symbol for that will be
    // generated in EHStreamer::emitExceptionTable() later.  So, we just
    // borrow its name here.
    TM->getStrList()->push_back(std::string(
        (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
    SDValue Addr =
        DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
    if (isPositionIndependent()) {
      // PIC: compute the GOT-offset hi/lo pair and add the global base reg.
      Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
                          VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
      SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
      return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
    }
    // Non-PIC: materialize the absolute address as a hi/lo pair.
    return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
  }
  }
}
1606 
1607 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
1608   if (!isa<BuildVectorSDNode>(N))
1609     return false;
1610   const auto *BVN = cast<BuildVectorSDNode>(N);
1611 
1612   // Find first non-undef insertion.
1613   unsigned Idx;
1614   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
1615     auto ElemV = BVN->getOperand(Idx);
1616     if (!ElemV->isUndef())
1617       break;
1618   }
1619   // Catch the (hypothetical) all-undef case.
1620   if (Idx == BVN->getNumOperands())
1621     return false;
1622   // Remember insertion.
1623   UniqueIdx = Idx++;
1624   // Verify that all other insertions are undef.
1625   for (; Idx < BVN->getNumOperands(); ++Idx) {
1626     auto ElemV = BVN->getOperand(Idx);
1627     if (!ElemV->isUndef())
1628       return false;
1629   }
1630   return true;
1631 }
1632 
1633 static SDValue getSplatValue(SDNode *N) {
1634   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
1635     return BuildVec->getSplatValue();
1636   }
1637   return SDValue();
1638 }
1639 
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumEls = Op.getValueType().getVectorNumElements();
  MVT ElemVT = Op.getSimpleValueType().getVectorElementType();

  // If there is just one element, expand to INSERT_VECTOR_ELT.
  unsigned UniqueIdx;
  if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
    // Insert the single live element into an undef vector.
    SDValue AccuV = DAG.getUNDEF(Op.getValueType());
    auto ElemV = Op->getOperand(UniqueIdx);
    SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
                       ElemV, IdxV);
  }

  // Else emit a broadcast.
  if (SDValue ScalarV = getSplatValue(Op.getNode())) {
    // lower to VEC_BROADCAST
    // Result uses a 256-element legal vector type; the AVL operand limits the
    // broadcast to the original element count.
    MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);

    auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
    return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
                       AVL);
  }

  // Expand
  return SDValue();
}
1669 
/// Dispatch custom lowering for operations marked Custom in the constructor.
/// VP opcodes and the VVP binary ops (listed in VVPNodes.def) are routed to
/// the common lowerToVVP path.
SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  unsigned Opcode = Op.getOpcode();
  if (ISD::isVPOpcode(Opcode))
    return lowerToVVP(Op, DAG);

  switch (Opcode) {
  default:
    llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:
    return lowerATOMIC_FENCE(Op, DAG);
  case ISD::ATOMIC_SWAP:
    return lowerATOMIC_SWAP(Op, DAG);
  case ISD::BlockAddress:
    return lowerBlockAddress(Op, DAG);
  case ISD::ConstantPool:
    return lowerConstantPool(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return lowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:
    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:
    return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::FRAMEADDR:
    return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
  case ISD::GlobalAddress:
    return lowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return lowerGlobalTLSAddress(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return lowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::JumpTable:
    return lowerJumpTable(Op, DAG);
  case ISD::LOAD:
    return lowerLOAD(Op, DAG);
  case ISD::RETURNADDR:
    return lowerRETURNADDR(Op, DAG, *this, Subtarget);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::STORE:
    return lowerSTORE(Op, DAG);
  case ISD::VASTART:
    return lowerVASTART(Op, DAG);
  case ISD::VAARG:
    return lowerVAARG(Op, DAG);

  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);

// Expand to one case label per VVP-mapped binary operation.
#define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
#include "VVPNodes.def"
    return lowerToVVP(Op, DAG);
  }
}
1727 /// } Custom Lower
1728 
1729 void VETargetLowering::ReplaceNodeResults(SDNode *N,
1730                                           SmallVectorImpl<SDValue> &Results,
1731                                           SelectionDAG &DAG) const {
1732   switch (N->getOpcode()) {
1733   case ISD::ATOMIC_SWAP:
1734     // Let LLVM expand atomic swap instruction through LowerOperation.
1735     return;
1736   default:
1737     LLVM_DEBUG(N->dumpr(&DAG));
1738     llvm_unreachable("Do not know how to custom type legalize this operation!");
1739   }
1740 }
1741 
/// JumpTable for VE.
///
///   VE cannot generate relocatable symbols in jump tables, i.e. it cannot
///   generate expressions using symbols from both the text segment and the
///   data segment, like below.
///             .4byte  .LBB0_2-.LJTI0_0
///   So, we instead emit each entry as an offset from the top of the
///   function, using a custom label, like below.
///             .4byte  .LBB0_2-<function name>
1751 
1752 unsigned VETargetLowering::getJumpTableEncoding() const {
1753   // Use custom label for PIC.
1754   if (isPositionIndependent())
1755     return MachineJumpTableInfo::EK_Custom32;
1756 
1757   // Otherwise, use the normal jump table encoding heuristics.
1758   return TargetLowering::getJumpTableEncoding();
1759 }
1760 
1761 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
1762     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
1763     unsigned Uid, MCContext &Ctx) const {
1764   assert(isPositionIndependent());
1765 
1766   // Generate custom label for PIC like below.
1767   //    .4bytes  .LBB0_2-<function name>
1768   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
1769   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
1770   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
1771   return MCBinaryExpr::createSub(Value, Base, Ctx);
1772 }
1773 
SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                   SelectionDAG &DAG) const {
  assert(isPositionIndependent());
  SDLoc DL(Table);
  Function *Function = &DAG.getMachineFunction().getFunction();
  assert(Function != nullptr);
  auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());

  // In the jump table, we have following values in PIC mode.
  //    .4bytes  .LBB0_2-<function name>
  // We need to add this value and the address of this function to generate
  // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
  // instructions:
  //     lea %reg, fun@gotoff_lo
  //     and %reg, %reg, (32)0
  //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
  // In order to do so, we need to generate correctly marked DAG node using
  // makeHiLoPair.
  SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
  SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
                              VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
  SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
  return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
}
1798 
/// Materialize the address of \p TargetBB into a fresh virtual register,
/// inserting instructions into \p MBB before \p I.  The emitted sequence
/// differs between PIC (GOT-offset relative to %s15) and non-PIC (absolute
/// hi/lo pair).  Returns the register holding the address.
Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock *TargetBB,
                                      const DebugLoc &DL) const {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const VEInstrInfo *TII = Subtarget->getInstrInfo();

  const TargetRegisterClass *RC = &VE::I64RegClass;
  Register Tmp1 = MRI.createVirtualRegister(RC);
  Register Tmp2 = MRI.createVirtualRegister(RC);
  Register Result = MRI.createVirtualRegister(RC);

  if (isPositionIndependent()) {
    // Create following instructions for local linkage PIC code.
    //     lea %Tmp1, TargetBB@gotoff_lo
    //     and %Tmp2, %Tmp1, (32)0
    //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
    BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
        .addImm(0)
        .addImm(0)
        .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32)
    BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
        .addReg(Tmp1, getKillRegState(true))
        .addImm(M0(32));
    BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
        .addReg(VE::SX15)
        .addReg(Tmp2, getKillRegState(true))
        .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
  } else {
    // Create following instructions for non-PIC code.
    //     lea     %Tmp1, TargetBB@lo
    //     and     %Tmp2, %Tmp1, (32)0
    //     lea.sl  %Result, TargetBB@hi(%Tmp2)
    BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
        .addImm(0)
        .addImm(0)
        .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
    BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
        .addReg(Tmp1, getKillRegState(true))
        .addImm(M0(32));
    BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
        .addReg(Tmp2, getKillRegState(true))
        .addImm(0)
        .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
  }
  return Result;
}
1847 
/// Materialize the address of external \p Symbol into a fresh virtual
/// register, inserting instructions into \p MBB before \p I.  The sequence
/// depends on PIC mode, on whether the symbol has local linkage (\p IsLocal),
/// and on whether it is a call target (\p IsCall, which routes non-local PIC
/// calls through the PLT via the GETFUNPLT pseudo).  Returns the register
/// holding the address.
Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator I,
                                         StringRef Symbol, const DebugLoc &DL,
                                         bool IsLocal = false,
                                         bool IsCall = false) const {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const VEInstrInfo *TII = Subtarget->getInstrInfo();

  const TargetRegisterClass *RC = &VE::I64RegClass;
  Register Result = MRI.createVirtualRegister(RC);

  if (isPositionIndependent()) {
    if (IsCall && !IsLocal) {
      // Create following instructions for non-local linkage PIC code function
      // calls.  These instructions uses IC and magic number -24, so we expand
      // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
      //     lea %Reg, Symbol@plt_lo(-24)
      //     and %Reg, %Reg, (32)0
      //     sic %s16
      //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
      BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
          .addExternalSymbol("abort");
    } else if (IsLocal) {
      Register Tmp1 = MRI.createVirtualRegister(RC);
      Register Tmp2 = MRI.createVirtualRegister(RC);
      // Create following instructions for local linkage PIC code.
      //     lea %Tmp1, Symbol@gotoff_lo
      //     and %Tmp2, %Tmp1, (32)0
      //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
      BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
          .addImm(0)
          .addImm(0)
          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
      BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
          .addReg(Tmp1, getKillRegState(true))
          .addImm(M0(32));
      BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
          .addReg(VE::SX15)
          .addReg(Tmp2, getKillRegState(true))
          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
    } else {
      Register Tmp1 = MRI.createVirtualRegister(RC);
      Register Tmp2 = MRI.createVirtualRegister(RC);
      // Create following instructions for not local linkage PIC code.
      //     lea %Tmp1, Symbol@got_lo
      //     and %Tmp2, %Tmp1, (32)0
      //     lea.sl %Tmp3, Symbol@got_hi(%Tmp2, %s15) ; %s15 is GOT
      //     ld %Result, 0(%Tmp3)
      Register Tmp3 = MRI.createVirtualRegister(RC);
      BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
          .addImm(0)
          .addImm(0)
          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
      BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
          .addReg(Tmp1, getKillRegState(true))
          .addImm(M0(32));
      BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
          .addReg(VE::SX15)
          .addReg(Tmp2, getKillRegState(true))
          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
      BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
          .addReg(Tmp3, getKillRegState(true))
          .addImm(0)
          .addImm(0);
    }
  } else {
    Register Tmp1 = MRI.createVirtualRegister(RC);
    Register Tmp2 = MRI.createVirtualRegister(RC);
    // Create following instructions for non-PIC code.
    //     lea     %Tmp1, Symbol@lo
    //     and     %Tmp2, %Tmp1, (32)0
    //     lea.sl  %Result, Symbol@hi(%Tmp2)
    BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
        .addImm(0)
        .addImm(0)
        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
    BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
        .addReg(Tmp1, getKillRegState(true))
        .addImm(M0(32));
    BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
        .addReg(Tmp2, getKillRegState(true))
        .addImm(0)
        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
  }
  return Result;
}
1935 
/// Insert code before \p MI in \p MBB that stores the address of
/// \p DispatchBB into jmpbuf[1] (the saved IC slot at \p FI + \p Offset),
/// so a later longjmp/throw resumes at the dispatch block.
void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
                                              MachineBasicBlock *MBB,
                                              MachineBasicBlock *DispatchBB,
                                              int FI, int Offset) const {
  DebugLoc DL = MI.getDebugLoc();
  const VEInstrInfo *TII = Subtarget->getInstrInfo();

  // Materialize DispatchBB's address into a register.
  Register LabelReg =
      prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);

  // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
  // referenced by longjmp (throw) later.
  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
  addFrameReference(MIB, FI, Offset); // jmpbuf[1]
  MIB.addReg(LabelReg, getKillRegState(true));
}
1952 
/// Expand the EH_SjLj_SetJmp pseudo: split the block into ThisMBB / MainMBB /
/// SinkMBB plus an out-of-line RestoreMBB, store the restore address (and BP
/// if used) into the jmpbuf, and produce 0 on the direct path or 1 on the
/// longjmp path via a PHI in SinkMBB.  Returns the block execution continues
/// in (SinkMBB).
MachineBasicBlock *
VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                   MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());
  Register BufReg = MI.getOperand(1).getReg();

  Register DstReg;

  DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  (void)TRI;
  Register MainDestReg = MRI.createVirtualRegister(RC);
  Register RestoreDestReg = MRI.createVirtualRegister(RC);

  // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
  // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
  //
  // ThisMBB:
  //   buf[3] = %s17 iff %s17 is used as BP
  //   buf[1] = RestoreMBB as IC after longjmp
  //   # SjLjSetup RestoreMBB
  //
  // MainMBB:
  //   v_main = 0
  //
  // SinkMBB:
  //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
  //   ...
  //
  // RestoreMBB:
  //   %s17 = buf[3] = iff %s17 is used as BP
  //   v_restore = 1
  //   goto SinkMBB

  MachineBasicBlock *ThisMBB = MBB;
  MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, MainMBB);
  MF->insert(I, SinkMBB);
  // RestoreMBB goes at the end of the function; longjmp enters it indirectly.
  MF->push_back(RestoreMBB);
  RestoreMBB->setHasAddressTaken();

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // ThisMBB:
  // Materialize the address of RestoreMBB for storing into the jmpbuf.
  Register LabelReg =
      prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);

  // Store BP in buf[3] iff this function is using BP.
  const VEFrameLowering *TFI = Subtarget->getFrameLowering();
  if (TFI->hasBP(*MF)) {
    MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
    MIB.addReg(BufReg);
    MIB.addImm(0);
    MIB.addImm(24);
    MIB.addReg(VE::SX17);
    MIB.setMemRefs(MMOs);
  }

  // Store IP in buf[1].
  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
  MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
  MIB.addImm(0);
  MIB.addImm(8);
  MIB.addReg(LabelReg, getKillRegState(true));
  MIB.setMemRefs(MMOs);

  // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.

  // Insert setup.
  MIB =
      BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);

  const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  ThisMBB->addSuccessor(MainMBB);
  ThisMBB->addSuccessor(RestoreMBB);

  // MainMBB:
  // Direct path: setjmp result is 0.
  BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
      .addImm(0)
      .addImm(0)
      .addImm(0);
  MainMBB->addSuccessor(SinkMBB);

  // SinkMBB:
  // Merge the two possible results into DstReg.
  BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
      .addReg(MainDestReg)
      .addMBB(MainMBB)
      .addReg(RestoreDestReg)
      .addMBB(RestoreMBB);

  // RestoreMBB:
  // Restore BP from buf[3] iff this function is using BP.  The address of
  // buf is in SX10.
  // FIXME: Better to not use SX10 here
  if (TFI->hasBP(*MF)) {
    MachineInstrBuilder MIB =
        BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
    MIB.addReg(VE::SX10);
    MIB.addImm(0);
    MIB.addImm(24);
    MIB.setMemRefs(MMOs);
  }
  // Longjmp path: setjmp result is 1.
  BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
      .addImm(0)
      .addImm(0)
      .addImm(1);
  BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
  RestoreMBB->addSuccessor(SinkMBB);

  MI.eraseFromParent();
  return SinkMBB;
}
2083 
/// Expand the EH_SjLj_LongJmp pseudo: reload FP, IC, and SP from the jmpbuf,
/// stash the buffer address in SX10 for the matching setjmp restore path,
/// and jump to the reloaded IC.  Returns the (unchanged) block.
MachineBasicBlock *
VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());
  Register BufReg = MI.getOperand(0).getReg();

  Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  Register FP = VE::SX9;
  Register SP = VE::SX11;

  MachineInstrBuilder MIB;

  MachineBasicBlock *ThisMBB = MBB;

  // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
  //
  // ThisMBB:
  //   %fp = load buf[0]
  //   %jmp = load buf[1]
  //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
  //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
  //   jmp %jmp

  // Reload FP.
  MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
  MIB.addReg(BufReg);
  MIB.addImm(0);
  MIB.addImm(0);
  MIB.setMemRefs(MMOs);

  // Reload IP.
  MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
  MIB.addReg(BufReg);
  MIB.addImm(0);
  MIB.addImm(8);
  MIB.setMemRefs(MMOs);

  // Copy BufReg to SX10 for later use in setjmp.
  // FIXME: Better to not use SX10 here
  BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
      .addReg(BufReg)
      .addImm(0);

  // Reload SP.
  MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
  MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
  MIB.addImm(0);
  MIB.addImm(16);
  MIB.setMemRefs(MMOs);

  // Jump.
  BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
      .addReg(Tmp, getKillRegState(true))
      .addImm(0);

  MI.eraseFromParent();
  return ThisMBB;
}
2150 
2151 MachineBasicBlock *
2152 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
2153                                         MachineBasicBlock *BB) const {
2154   DebugLoc DL = MI.getDebugLoc();
2155   MachineFunction *MF = BB->getParent();
2156   MachineFrameInfo &MFI = MF->getFrameInfo();
2157   MachineRegisterInfo &MRI = MF->getRegInfo();
2158   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2159   int FI = MFI.getFunctionContextIndex();
2160 
2161   // Get a mapping of the call site numbers to all of the landing pads they're
2162   // associated with.
2163   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
2164   unsigned MaxCSNum = 0;
2165   for (auto &MBB : *MF) {
2166     if (!MBB.isEHPad())
2167       continue;
2168 
2169     MCSymbol *Sym = nullptr;
2170     for (const auto &MI : MBB) {
2171       if (MI.isDebugInstr())
2172         continue;
2173 
2174       assert(MI.isEHLabel() && "expected EH_LABEL");
2175       Sym = MI.getOperand(0).getMCSymbol();
2176       break;
2177     }
2178 
2179     if (!MF->hasCallSiteLandingPad(Sym))
2180       continue;
2181 
2182     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
2183       CallSiteNumToLPad[CSI].push_back(&MBB);
2184       MaxCSNum = std::max(MaxCSNum, CSI);
2185     }
2186   }
2187 
2188   // Get an ordered list of the machine basic blocks for the jump table.
2189   std::vector<MachineBasicBlock *> LPadList;
2190   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
2191   LPadList.reserve(CallSiteNumToLPad.size());
2192 
2193   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
2194     for (auto &LP : CallSiteNumToLPad[CSI]) {
2195       LPadList.push_back(LP);
2196       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
2197     }
2198   }
2199 
2200   assert(!LPadList.empty() &&
2201          "No landing pad destinations for the dispatch jump table!");
2202 
2203   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
2204   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
2205   //
2206   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
2207   // First `i64` is callsite, so callsite is FI+8.
2208   static const int OffsetIC = 72;
2209   static const int OffsetCS = 8;
2210 
2211   // Create the MBBs for the dispatch code like following:
2212   //
2213   // ThisMBB:
2214   //   Prepare DispatchBB address and store it to buf[1].
2215   //   ...
2216   //
2217   // DispatchBB:
2218   //   %s15 = GETGOT iff isPositionIndependent
2219   //   %callsite = load callsite
2220   //   brgt.l.t #size of callsites, %callsite, DispContBB
2221   //
2222   // TrapBB:
2223   //   Call abort.
2224   //
2225   // DispContBB:
2226   //   %breg = address of jump table
2227   //   %pc = load and calculate next pc from %breg and %callsite
2228   //   jmp %pc
2229 
2230   // Shove the dispatch's address into the return slot in the function context.
2231   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
2232   DispatchBB->setIsEHPad(true);
2233 
2234   // Trap BB will causes trap like `assert(0)`.
2235   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
2236   DispatchBB->addSuccessor(TrapBB);
2237 
2238   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
2239   DispatchBB->addSuccessor(DispContBB);
2240 
2241   // Insert MBBs.
2242   MF->push_back(DispatchBB);
2243   MF->push_back(DispContBB);
2244   MF->push_back(TrapBB);
2245 
2246   // Insert code to call abort in the TrapBB.
2247   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
2248                                  /* Local */ false, /* Call */ true);
2249   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
2250       .addReg(Abort, getKillRegState(true))
2251       .addImm(0)
2252       .addImm(0);
2253 
2254   // Insert code into the entry block that creates and registers the function
2255   // context.
2256   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
2257 
2258   // Create the jump table and associated information
2259   unsigned JTE = getJumpTableEncoding();
2260   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
2261   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
2262 
2263   const VERegisterInfo &RI = TII->getRegisterInfo();
2264   // Add a register mask with no preserved registers.  This results in all
2265   // registers being marked as clobbered.
2266   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
2267       .addRegMask(RI.getNoPreservedMask());
2268 
2269   if (isPositionIndependent()) {
2270     // Force to generate GETGOT, since current implementation doesn't store GOT
2271     // register.
2272     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
2273   }
2274 
2275   // IReg is used as an index in a memory operand and therefore can't be SP
2276   const TargetRegisterClass *RC = &VE::I64RegClass;
2277   Register IReg = MRI.createVirtualRegister(RC);
2278   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
2279                     OffsetCS);
2280   if (LPadList.size() < 64) {
2281     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
2282         .addImm(VECC::CC_ILE)
2283         .addImm(LPadList.size())
2284         .addReg(IReg)
2285         .addMBB(TrapBB);
2286   } else {
2287     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
2288     Register TmpReg = MRI.createVirtualRegister(RC);
2289     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
2290         .addImm(0)
2291         .addImm(0)
2292         .addImm(LPadList.size());
2293     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
2294         .addImm(VECC::CC_ILE)
2295         .addReg(TmpReg, getKillRegState(true))
2296         .addReg(IReg)
2297         .addMBB(TrapBB);
2298   }
2299 
2300   Register BReg = MRI.createVirtualRegister(RC);
2301   Register Tmp1 = MRI.createVirtualRegister(RC);
2302   Register Tmp2 = MRI.createVirtualRegister(RC);
2303 
2304   if (isPositionIndependent()) {
2305     // Create following instructions for local linkage PIC code.
2306     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
2307     //     and    %Tmp2, %Tmp1, (32)0
2308     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2309     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2310         .addImm(0)
2311         .addImm(0)
2312         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
2313     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2314         .addReg(Tmp1, getKillRegState(true))
2315         .addImm(M0(32));
2316     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
2317         .addReg(VE::SX15)
2318         .addReg(Tmp2, getKillRegState(true))
2319         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
2320   } else {
2321     // Create following instructions for non-PIC code.
2322     //     lea     %Tmp1, .LJTI0_0@lo
2323     //     and     %Tmp2, %Tmp1, (32)0
2324     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
2325     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2326         .addImm(0)
2327         .addImm(0)
2328         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
2329     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2330         .addReg(Tmp1, getKillRegState(true))
2331         .addImm(M0(32));
2332     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
2333         .addReg(Tmp2, getKillRegState(true))
2334         .addImm(0)
2335         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
2336   }
2337 
2338   switch (JTE) {
2339   case MachineJumpTableInfo::EK_BlockAddress: {
2340     // Generate simple block address code for no-PIC model.
2341     //     sll %Tmp1, %IReg, 3
2342     //     lds %TReg, 0(%Tmp1, %BReg)
2343     //     bcfla %TReg
2344 
2345     Register TReg = MRI.createVirtualRegister(RC);
2346     Register Tmp1 = MRI.createVirtualRegister(RC);
2347 
2348     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2349         .addReg(IReg, getKillRegState(true))
2350         .addImm(3);
2351     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
2352         .addReg(BReg, getKillRegState(true))
2353         .addReg(Tmp1, getKillRegState(true))
2354         .addImm(0);
2355     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2356         .addReg(TReg, getKillRegState(true))
2357         .addImm(0);
2358     break;
2359   }
2360   case MachineJumpTableInfo::EK_Custom32: {
2361     // Generate block address code using differences from the function pointer
2362     // for PIC model.
2363     //     sll %Tmp1, %IReg, 2
2364     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
2365     //     Prepare function address in BReg2.
2366     //     adds.l %TReg, %BReg2, %OReg
2367     //     bcfla %TReg
2368 
2369     assert(isPositionIndependent());
2370     Register OReg = MRI.createVirtualRegister(RC);
2371     Register TReg = MRI.createVirtualRegister(RC);
2372     Register Tmp1 = MRI.createVirtualRegister(RC);
2373 
2374     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2375         .addReg(IReg, getKillRegState(true))
2376         .addImm(2);
2377     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
2378         .addReg(BReg, getKillRegState(true))
2379         .addReg(Tmp1, getKillRegState(true))
2380         .addImm(0);
2381     Register BReg2 =
2382         prepareSymbol(*DispContBB, DispContBB->end(),
2383                       DispContBB->getParent()->getName(), DL, /* Local */ true);
2384     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
2385         .addReg(OReg, getKillRegState(true))
2386         .addReg(BReg2, getKillRegState(true));
2387     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2388         .addReg(TReg, getKillRegState(true))
2389         .addImm(0);
2390     break;
2391   }
2392   default:
2393     llvm_unreachable("Unexpected jump table encoding");
2394   }
2395 
2396   // Add the jump table entries as successors to the MBB.
2397   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
2398   for (auto &LP : LPadList)
2399     if (SeenMBBs.insert(LP).second)
2400       DispContBB->addSuccessor(LP);
2401 
2402   // N.B. the order the invoke BBs are processed in doesn't matter here.
2403   SmallVector<MachineBasicBlock *, 64> MBBLPads;
2404   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
2405   for (MachineBasicBlock *MBB : InvokeBBs) {
2406     // Remove the landing pad successor from the invoke block and replace it
2407     // with the new dispatch block.
2408     // Keep a copy of Successors since it's modified inside the loop.
2409     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
2410                                                    MBB->succ_rend());
2411     // FIXME: Avoid quadratic complexity.
2412     for (auto MBBS : Successors) {
2413       if (MBBS->isEHPad()) {
2414         MBB->removeSuccessor(MBBS);
2415         MBBLPads.push_back(MBBS);
2416       }
2417     }
2418 
2419     MBB->addSuccessor(DispatchBB);
2420 
2421     // Find the invoke call and mark all of the callee-saved registers as
2422     // 'implicit defined' so that they're spilled.  This prevents code from
2423     // moving instructions to before the EH block, where they will never be
2424     // executed.
2425     for (auto &II : reverse(*MBB)) {
2426       if (!II.isCall())
2427         continue;
2428 
2429       DenseMap<Register, bool> DefRegs;
2430       for (auto &MOp : II.operands())
2431         if (MOp.isReg())
2432           DefRegs[MOp.getReg()] = true;
2433 
2434       MachineInstrBuilder MIB(*MF, &II);
2435       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
2436         Register Reg = SavedRegs[RI];
2437         if (!DefRegs[Reg])
2438           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
2439       }
2440 
2441       break;
2442     }
2443   }
2444 
2445   // Mark all former landing pads as non-landing pads.  The dispatch is the only
2446   // landing pad now.
2447   for (auto &LP : MBBLPads)
2448     LP->setIsEHPad(false);
2449 
2450   // The instruction is gone now.
2451   MI.eraseFromParent();
2452   return BB;
2453 }
2454 
2455 MachineBasicBlock *
2456 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
2457                                               MachineBasicBlock *BB) const {
2458   switch (MI.getOpcode()) {
2459   default:
2460     llvm_unreachable("Unknown Custom Instruction!");
2461   case VE::EH_SjLj_LongJmp:
2462     return emitEHSjLjLongJmp(MI, BB);
2463   case VE::EH_SjLj_SetJmp:
2464     return emitEHSjLjSetJmp(MI, BB);
2465   case VE::EH_SjLj_Setup_Dispatch:
2466     return emitSjLjDispatchBlock(MI, BB);
2467   }
2468 }
2469 
/// Returns true if it is safe for \p User to consume \p N (an i32 value)
/// without an explicit truncate, i.e. \p User treats the operand as i32 and
/// does not observe the upper 32 bits of the underlying 64-bit register.
/// Used by combineTRUNCATE below to decide whether TRUNCATE can be replaced
/// by a plain EXTRACT_SUBREG.
static bool isI32Insn(const SDNode *User, const SDNode *N) {
  switch (User->getOpcode()) {
  default:
    // Unknown users are conservatively treated as unsafe.
    return false;
  // These operations treat an i32-typed operand as a genuine 32-bit value,
  // so dropping the truncate is safe for them.
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SETCC:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::SHL:
  case ISD::SRA:
  case ISD::BSWAP:
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::BR_CC:
  case ISD::BITCAST:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_SWAP:
    return true;
  case ISD::SRL:
    // Safe unless the truncate's own input is another SRL (see below).
    if (N->getOperand(0).getOpcode() != ISD::SRL)
      return true;
    // (srl (trunc (srl ...))) may be optimized by combining srl, so
    // doesn't optimize trunc now.
    return false;
  case ISD::SELECT_CC:
    // If N only feeds the comparison operands (0 and 1), the compare itself
    // is an i32 operation and dropping the truncate is safe.  If N is one of
    // the selected values (operands 2 or 3), its bits flow through to the
    // SELECT_CC result, so all users of the result must be inspected —
    // fall through into the user scan below.
    if (User->getOperand(2).getNode() != N &&
        User->getOperand(3).getNode() != N)
      return true;
    LLVM_FALLTHROUGH;
  // These operations pass the operand's bits through to their own result.
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SELECT:
  case ISD::CopyToReg:
    // Check all use of selections, bit operations, and copies.  If all of them
    // are safe, optimize truncate to extract_subreg.
    for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end();
         UI != UE; ++UI) {
      switch ((*UI)->getOpcode()) {
      default:
        // If the use is an instruction which treats the source operand as i32,
        // it is safe to avoid truncate here.
        if (isI32Insn(*UI, N))
          continue;
        break;
      case ISD::ANY_EXTEND:
      case ISD::SIGN_EXTEND:
      case ISD::ZERO_EXTEND: {
        // Special optimizations to the combination of ext and trunc.
        // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
        // since this truncate instruction clears higher 32 bits which is filled
        // by one of ext instructions later.
        assert(N->getValueType(0) == MVT::i32 &&
               "find truncate to not i32 integer");
        if (User->getOpcode() == ISD::SELECT_CC ||
            User->getOpcode() == ISD::SELECT)
          continue;
        break;
      }
      }
      // Reaching here means the use above could not be proven safe.
      return false;
    }
    // Every transitive use was proven safe.
    return true;
  }
}
2539 
2540 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
2541 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
2542 // is sometime too late.  So, doing it at here.
2543 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
2544                                           DAGCombinerInfo &DCI) const {
2545   assert(N->getOpcode() == ISD::TRUNCATE &&
2546          "Should be called with a TRUNCATE node");
2547 
2548   SelectionDAG &DAG = DCI.DAG;
2549   SDLoc DL(N);
2550   EVT VT = N->getValueType(0);
2551 
2552   // We prefer to do this when all types are legal.
2553   if (!DCI.isAfterLegalizeDAG())
2554     return SDValue();
2555 
2556   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
2557   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
2558       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
2559       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
2560     return SDValue();
2561 
2562   // Check all use of this TRUNCATE.
2563   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
2564        ++UI) {
2565     SDNode *User = *UI;
2566 
2567     // Make sure that we're not going to replace TRUNCATE for non i32
2568     // instructions.
2569     //
2570     // FIXME: Although we could sometimes handle this, and it does occur in
2571     // practice that one of the condition inputs to the select is also one of
2572     // the outputs, we currently can't deal with this.
2573     if (isI32Insn(User, N))
2574       continue;
2575 
2576     return SDValue();
2577   }
2578 
2579   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2580   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
2581                                     N->getOperand(0), SubI32),
2582                  0);
2583 }
2584 
2585 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
2586                                             DAGCombinerInfo &DCI) const {
2587   switch (N->getOpcode()) {
2588   default:
2589     break;
2590   case ISD::TRUNCATE:
2591     return combineTRUNCATE(N, DCI);
2592   }
2593 
2594   return SDValue();
2595 }
2596 
2597 //===----------------------------------------------------------------------===//
2598 // VE Inline Assembly Support
2599 //===----------------------------------------------------------------------===//
2600 
2601 VETargetLowering::ConstraintType
2602 VETargetLowering::getConstraintType(StringRef Constraint) const {
2603   if (Constraint.size() == 1) {
2604     switch (Constraint[0]) {
2605     default:
2606       break;
2607     case 'v': // vector registers
2608       return C_RegisterClass;
2609     }
2610   }
2611   return TargetLowering::getConstraintType(Constraint);
2612 }
2613 
2614 std::pair<unsigned, const TargetRegisterClass *>
2615 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
2616                                                StringRef Constraint,
2617                                                MVT VT) const {
2618   const TargetRegisterClass *RC = nullptr;
2619   if (Constraint.size() == 1) {
2620     switch (Constraint[0]) {
2621     default:
2622       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2623     case 'r':
2624       RC = &VE::I64RegClass;
2625       break;
2626     case 'v':
2627       RC = &VE::V64RegClass;
2628       break;
2629     }
2630     return std::make_pair(0U, RC);
2631   }
2632 
2633   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2634 }
2635 
2636 //===----------------------------------------------------------------------===//
2637 // VE Target Optimization Support
2638 //===----------------------------------------------------------------------===//
2639 
2640 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
2641   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
2642   if (isJumpTableRelative())
2643     return 8;
2644 
2645   return TargetLowering::getMinimumJumpTableEntries();
2646 }
2647 
2648 bool VETargetLowering::hasAndNot(SDValue Y) const {
2649   EVT VT = Y.getValueType();
2650 
2651   // VE doesn't have vector and not instruction.
2652   if (VT.isVector())
2653     return false;
2654 
2655   // VE allows different immediate values for X and Y where ~X & Y.
2656   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
2657   // function is used to check whether an immediate value is OK for and-not
2658   // instruction as both X and Y.  Generating additional instruction to
2659   // retrieve an immediate value is no good since the purpose of this
2660   // function is to convert a series of 3 instructions to another series of
2661   // 3 instructions with better parallelism.  Therefore, we return false
2662   // for all immediate values now.
2663   // FIXME: Change hasAndNot function to have two operands to make it work
2664   //        correctly with Aurora VE.
2665   if (isa<ConstantSDNode>(Y))
2666     return false;
2667 
2668   // It's ok for generic registers.
2669   return true;
2670 }
2671 
/// \returns the VVP_* SDNode opcode corresponding to \p Opcode, or None if
/// \p Opcode has no VVP counterpart.  The mapping is generated from
/// VVPNodes.def: VP_* opcodes map via HANDLE_VP_TO_VVP, while generic SD
/// opcodes (and the VVP opcodes themselves) map via ADD_VVP_OP.
static Optional<unsigned> getVVPOpcode(unsigned Opcode) {
  switch (Opcode) {
#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME)                                       \
  case ISD::VPOPC:                                                             \
    return VEISD::VVPNAME;
#define ADD_VVP_OP(VVPNAME, SDNAME)                                            \
  case VEISD::VVPNAME:                                                         \
  case ISD::SDNAME:                                                            \
    return VEISD::VVPNAME;
#include "VVPNodes.def"
  }
  // Opcode has no VVP equivalent.
  return None;
}
2686 
/// Lower \p Op to its VVP_* equivalent node, attaching explicit mask and AVL
/// (active vector length) operands.  Returns an empty SDValue if \p Op has no
/// VVP counterpart.
SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
  // Can we represent this as a VVP node.
  const unsigned Opcode = Op->getOpcode();
  auto VVPOpcodeOpt = getVVPOpcode(Opcode);
  if (!VVPOpcodeOpt.hasValue())
    return SDValue();
  unsigned VVPOpcode = VVPOpcodeOpt.getValue();
  // VP nodes already carry their own mask and AVL operands; other nodes need
  // these materialized below.
  const bool FromVP = ISD::isVPOpcode(Opcode);

  // The representative and legalized vector type of this operation.
  SDLoc DL(Op);
  MVT MaskVT = MVT::v256i1; // TODO: packed mode.
  EVT OpVecVT = Op.getValueType();
  EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);

  SDValue AVL;
  SDValue Mask;

  if (FromVP) {
    // All upstream VP SDNodes always have a mask and avl.
    auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue();
    auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue();
    Mask = Op->getOperand(MaskIdx);
    AVL = Op->getOperand(AVLIdx);

  } else {
    // Materialize the VL parameter.
    AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32);
    // Build an all-true mask by broadcasting 1 to every mask lane.
    SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32);
    Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT,
                       ConstTrue); // emit a VEISD::VEC_BROADCAST here.
  }

  // Categories we are interested in.
  bool IsBinaryOp = false;

  // Classify the VVP opcode; ADD_BINARY_VVP_OP expands to the binary ops
  // listed in VVPNodes.def.
  switch (VVPOpcode) {
#define ADD_BINARY_VVP_OP(VVPNAME, ...)                                        \
  case VEISD::VVPNAME:                                                         \
    IsBinaryOp = true;                                                         \
    break;
#include "VVPNodes.def"
  }

  if (IsBinaryOp) {
    assert(LegalVecVT.isSimple());
    // Re-emit the operation as the VVP node with explicit mask and AVL.
    return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
                       Op->getOperand(1), Mask, AVL);
  }
  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
}
2738 
/// Custom-lower EXTRACT_VECTOR_ELT for the packed v512i32/v512f32 types.
/// Two 32-bit elements share one 64-bit vector-register element, so the
/// 64-bit element at Idx/2 is read and the requested half is shifted down
/// into the low 32 bits.
SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  MVT VT = Op.getOperand(0).getSimpleValueType();

  // Special treatment for packed V64 types.
  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
  // Example of codes:
  //   %packed_v = extractelt %vr, %idx / 2
  //   %v = %packed_v >> (%idx % 2 * 32)
  //   %res = %v & 0xffffffff

  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  SDLoc DL(Op);
  SDValue Result = Op;
  if (0 /* Idx->isConstant() */) {
    // TODO: optimized implementation using constant values
  } else {
    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
    // HalfIdx = Idx / 2: index of the 64-bit element holding the pair.
    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
    // Read the 64-bit packed element out of the vector register.
    SDValue PackedElt =
        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
    // Shift = ((Idx & 1) ^ 1) << 5, i.e. 32 for even indices (which live in
    // the upper 32 bits) and 0 for odd indices.
    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
    // Move the requested half into the low 32 bits and mask off the rest.
    PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
    SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
    // Narrow the i64 down to i32 via a sub-register extraction.
    SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
    Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                        MVT::i32, PackedElt, SubI32),
                     0);

    // For v512f32 the caller expects an f32; reinterpret the i32 bits.
    if (Op.getSimpleValueType() == MVT::f32) {
      Result = DAG.getBitcast(MVT::f32, Result);
    } else {
      assert(Op.getSimpleValueType() == MVT::i32);
    }
  }
  return Result;
}
2782 
/// Custom-lower INSERT_VECTOR_ELT for the packed v512i32/v512f32 types.
/// Two 32-bit elements share one 64-bit vector-register element, so the
/// 64-bit element at Idx/2 is read, the target half replaced, and the result
/// written back.
SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
  MVT VT = Op.getOperand(0).getSimpleValueType();

  // Special treatment for packed V64 types.
  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
  // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
  // bits" required `val << 32` from C implementation's point of view.
  //
  // Example of codes:
  //   %packed_elt = extractelt %vr, (%idx >> 1)
  //   %shift = ((%idx & 1) ^ 1) << 5
  //   %packed_elt &= 0xffffffff00000000 >> shift
  //   %packed_elt |= (zext %val) << shift
  //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)

  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Val = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  // Normalize the index to i64 and the value to a zero-extended i64 so all
  // of the bit arithmetic below happens in one type.
  if (Idx.getSimpleValueType() == MVT::i32)
    Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
  if (Val.getSimpleValueType() == MVT::f32)
    Val = DAG.getBitcast(MVT::i32, Val);
  assert(Val.getSimpleValueType() == MVT::i32);
  Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);

  SDValue Result = Op;
  if (0 /* Idx->isConstant()*/) {
    // TODO: optimized implementation using constant values
  } else {
    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
    // HalfIdx = Idx / 2: index of the 64-bit element holding the pair.
    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
    // Read the current 64-bit packed element.
    SDValue PackedElt =
        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
    // Shift = ((Idx & 1) ^ 1) << 5: 32 for even indices (upper half), 0 for
    // odd indices (lower half).
    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
    // Clear the 32 bits being replaced, then OR in the shifted new value.
    SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
    Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
    Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
    PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
    // Write the merged 64-bit element back into the vector register.
    Result =
        SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
                                   {HalfIdx, PackedElt, Vec}),
                0);
  }
  return Result;
}
2835