1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the interfaces that VE uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "VEISelLowering.h"
15 #include "MCTargetDesc/VEMCExpr.h"
16 #include "VECustomDAG.h"
17 #include "VEInstrBuilder.h"
18 #include "VEMachineFunctionInfo.h"
19 #include "VERegisterInfo.h"
20 #include "VETargetMachine.h"
21 #include "llvm/ADT/StringSwitch.h"
22 #include "llvm/CodeGen/CallingConvLower.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineJumpTableInfo.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/SelectionDAG.h"
30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/KnownBits.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "ve-lower"
40 
41 //===----------------------------------------------------------------------===//
42 // Calling Convention Implementation
43 //===----------------------------------------------------------------------===//
44 
45 #include "VEGenCallingConv.inc"
46 
47 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
48   switch (CallConv) {
49   default:
50     return RetCC_VE_C;
51   case CallingConv::Fast:
52     return RetCC_VE_Fast;
53   }
54 }
55 
56 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
57   if (IsVarArg)
58     return CC_VE2;
59   switch (CallConv) {
60   default:
61     return CC_VE_C;
62   case CallingConv::Fast:
63     return CC_VE_Fast;
64   }
65 }
66 
67 bool VETargetLowering::CanLowerReturn(
68     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
69     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
70   CCAssignFn *RetCC = getReturnCC(CallConv);
71   SmallVector<CCValAssign, 16> RVLocs;
72   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
73   return CCInfo.CheckReturn(Outs, RetCC);
74 }
75 
76 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
77                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
78 
79 static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1};
80 
81 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
82 
83 void VETargetLowering::initRegisterClasses() {
84   // Set up the register classes.
85   addRegisterClass(MVT::i32, &VE::I32RegClass);
86   addRegisterClass(MVT::i64, &VE::I64RegClass);
87   addRegisterClass(MVT::f32, &VE::F32RegClass);
88   addRegisterClass(MVT::f64, &VE::I64RegClass);
89   addRegisterClass(MVT::f128, &VE::F128RegClass);
90 
91   if (Subtarget->enableVPU()) {
92     for (MVT VecVT : AllVectorVTs)
93       addRegisterClass(VecVT, &VE::V64RegClass);
94     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
95     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
96   }
97 }
98 
99 void VETargetLowering::initSPUActions() {
100   const auto &TM = getTargetMachine();
101   /// Load & Store {
102 
103   // VE doesn't have i1 sign extending load.
104   for (MVT VT : MVT::integer_valuetypes()) {
105     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
106     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
107     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
108     setTruncStoreAction(VT, MVT::i1, Expand);
109   }
110 
111   // VE doesn't have floating point extload/truncstore, so expand them.
112   for (MVT FPVT : MVT::fp_valuetypes()) {
113     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
114       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
115       setTruncStoreAction(FPVT, OtherFPVT, Expand);
116     }
117   }
118 
119   // VE doesn't have fp128 load/store, so expand them in custom lower.
120   setOperationAction(ISD::LOAD, MVT::f128, Custom);
121   setOperationAction(ISD::STORE, MVT::f128, Custom);
122 
123   /// } Load & Store
124 
125   // Custom legalize address nodes into LO/HI parts.
126   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
127   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
128   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
129   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
130   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
131   setOperationAction(ISD::JumpTable, PtrVT, Custom);
132 
133   /// VAARG handling {
134   setOperationAction(ISD::VASTART, MVT::Other, Custom);
135   // VAARG needs to be lowered to access with 8 bytes alignment.
136   setOperationAction(ISD::VAARG, MVT::Other, Custom);
137   // Use the default implementation.
138   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
139   setOperationAction(ISD::VAEND, MVT::Other, Expand);
140   /// } VAARG handling
141 
142   /// Stack {
143   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
144   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
145 
146   // Use the default implementation.
147   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
148   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
149   /// } Stack
150 
151   /// Branch {
152 
153   // VE doesn't have BRCOND
154   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
155 
156   // BR_JT is not implemented yet.
157   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
158 
159   /// } Branch
160 
161   /// Int Ops {
162   for (MVT IntVT : {MVT::i32, MVT::i64}) {
163     // VE has no REM or DIVREM operations.
164     setOperationAction(ISD::UREM, IntVT, Expand);
165     setOperationAction(ISD::SREM, IntVT, Expand);
166     setOperationAction(ISD::SDIVREM, IntVT, Expand);
167     setOperationAction(ISD::UDIVREM, IntVT, Expand);
168 
169     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
170     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
171     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
172     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
173 
174     // VE has no MULHU/S or U/SMUL_LOHI operations.
175     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
176     setOperationAction(ISD::MULHU, IntVT, Expand);
177     setOperationAction(ISD::MULHS, IntVT, Expand);
178     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
179     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
180 
181     // VE has no CTTZ, ROTL, ROTR operations.
182     setOperationAction(ISD::CTTZ, IntVT, Expand);
183     setOperationAction(ISD::ROTL, IntVT, Expand);
184     setOperationAction(ISD::ROTR, IntVT, Expand);
185 
186     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
187     // instruction works fine as i32 BSWAP operation with an additional
188     // parameter.  Use isel patterns to lower BSWAP.
189     setOperationAction(ISD::BSWAP, IntVT, Legal);
190 
191     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
192     // operations.  Use isel patterns for i64, promote for i32.
193     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
194     setOperationAction(ISD::BITREVERSE, IntVT, Act);
195     setOperationAction(ISD::CTLZ, IntVT, Act);
196     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
197     setOperationAction(ISD::CTPOP, IntVT, Act);
198 
199     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
200     // Use isel patterns for i64, promote for i32.
201     setOperationAction(ISD::AND, IntVT, Act);
202     setOperationAction(ISD::OR, IntVT, Act);
203     setOperationAction(ISD::XOR, IntVT, Act);
204   }
205   /// } Int Ops
206 
207   /// Conversion {
208   // VE doesn't have instructions for fp<->uint, so expand them by llvm
209   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
210   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
211   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
212   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
213 
214   // fp16 not supported
215   for (MVT FPVT : MVT::fp_valuetypes()) {
216     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
217     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
218   }
219   /// } Conversion
220 
221   /// Floating-point Ops {
222   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
223   ///       and fcmp.
224 
225   // VE doesn't have following floating point operations.
226   for (MVT VT : MVT::fp_valuetypes()) {
227     setOperationAction(ISD::FNEG, VT, Expand);
228     setOperationAction(ISD::FREM, VT, Expand);
229   }
230 
231   // VE doesn't have fdiv of f128.
232   setOperationAction(ISD::FDIV, MVT::f128, Expand);
233 
234   for (MVT FPVT : {MVT::f32, MVT::f64}) {
235     // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
236     setOperationAction(ISD::ConstantFP, FPVT, Legal);
237   }
238   /// } Floating-point Ops
239 
240   /// Floating-point math functions {
241 
242   // VE doesn't have following floating point math functions.
243   for (MVT VT : MVT::fp_valuetypes()) {
244     setOperationAction(ISD::FABS, VT, Expand);
245     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
246     setOperationAction(ISD::FCOS, VT, Expand);
247     setOperationAction(ISD::FSIN, VT, Expand);
248     setOperationAction(ISD::FSQRT, VT, Expand);
249   }
250 
251   /// } Floating-point math functions
252 
253   /// Atomic instructions {
254 
255   setMaxAtomicSizeInBitsSupported(64);
256   setMinCmpXchgSizeInBits(32);
257   setSupportsUnalignedAtomics(false);
258 
259   // Use custom inserter for ATOMIC_FENCE.
260   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
261 
262   // Other atomic instructions.
263   for (MVT VT : MVT::integer_valuetypes()) {
264     // Support i8/i16 atomic swap.
265     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
266 
267     // FIXME: Support "atmam" instructions.
268     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
269     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
270     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
271     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
272 
273     // VE doesn't have follwing instructions.
274     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
275     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
276     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
277     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
278     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
279     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
280     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
281     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
282   }
283 
284   /// } Atomic instructions
285 
286   /// SJLJ instructions {
287   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
288   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
289   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
290   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
291     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
292   /// } SJLJ instructions
293 
294   // Intrinsic instructions
295   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
296 }
297 
298 void VETargetLowering::initVPUActions() {
299   for (MVT LegalMaskVT : AllMaskVTs)
300     setOperationAction(ISD::BUILD_VECTOR, LegalMaskVT, Custom);
301 
302   for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR})
303     setOperationAction(Opc, MVT::v512i1, Custom);
304 
305   for (MVT LegalVecVT : AllVectorVTs) {
306     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
307     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
308     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
309     // Translate all vector instructions with legal element types to VVP_*
310     // nodes.
311     // TODO We will custom-widen into VVP_* nodes in the future. While we are
312     // buildling the infrastructure for this, we only do this for legal vector
313     // VTs.
314 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
315   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
316 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
317   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
318     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, LegalVecVT, Custom);
319     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_STORE, LegalVecVT, Custom);
320 #include "VVPNodes.def"
321   }
322 
323   for (MVT LegalPackedVT : AllPackedVTs) {
324     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
325     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
326   }
327 
328   // vNt32, vNt64 ops (legal element types)
329   for (MVT VT : MVT::vector_valuetypes()) {
330     MVT ElemVT = VT.getVectorElementType();
331     unsigned ElemBits = ElemVT.getScalarSizeInBits();
332     if (ElemBits != 32 && ElemBits != 64)
333       continue;
334 
335     for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
336       setOperationAction(MemOpc, VT, Custom);
337 
338     const ISD::NodeType IntReductionOCs[] = {
339         ISD::VECREDUCE_ADD,  ISD::VECREDUCE_MUL,  ISD::VECREDUCE_AND,
340         ISD::VECREDUCE_OR,   ISD::VECREDUCE_XOR,  ISD::VECREDUCE_SMIN,
341         ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
342 
343     for (unsigned IntRedOpc : IntReductionOCs)
344       setOperationAction(IntRedOpc, VT, Custom);
345   }
346 
347   // v256i1 and v512i1 ops
348   for (MVT MaskVT : AllMaskVTs) {
349     // Custom lower mask ops
350     setOperationAction(ISD::STORE, MaskVT, Custom);
351     setOperationAction(ISD::LOAD, MaskVT, Custom);
352   }
353 }
354 
355 SDValue
356 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
357                               bool IsVarArg,
358                               const SmallVectorImpl<ISD::OutputArg> &Outs,
359                               const SmallVectorImpl<SDValue> &OutVals,
360                               const SDLoc &DL, SelectionDAG &DAG) const {
361   // CCValAssign - represent the assignment of the return value to locations.
362   SmallVector<CCValAssign, 16> RVLocs;
363 
364   // CCState - Info about the registers and stack slot.
365   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
366                  *DAG.getContext());
367 
368   // Analyze return values.
369   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
370 
371   SDValue Flag;
372   SmallVector<SDValue, 4> RetOps(1, Chain);
373 
374   // Copy the result values into the output registers.
375   for (unsigned i = 0; i != RVLocs.size(); ++i) {
376     CCValAssign &VA = RVLocs[i];
377     assert(VA.isRegLoc() && "Can only return in registers!");
378     assert(!VA.needsCustom() && "Unexpected custom lowering");
379     SDValue OutVal = OutVals[i];
380 
381     // Integer return values must be sign or zero extended by the callee.
382     switch (VA.getLocInfo()) {
383     case CCValAssign::Full:
384       break;
385     case CCValAssign::SExt:
386       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
387       break;
388     case CCValAssign::ZExt:
389       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
390       break;
391     case CCValAssign::AExt:
392       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
393       break;
394     case CCValAssign::BCvt: {
395       // Convert a float return value to i64 with padding.
396       //     63     31   0
397       //    +------+------+
398       //    | float|   0  |
399       //    +------+------+
400       assert(VA.getLocVT() == MVT::i64);
401       assert(VA.getValVT() == MVT::f32);
402       SDValue Undef = SDValue(
403           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
404       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
405       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
406                                           MVT::i64, Undef, OutVal, Sub_f32),
407                        0);
408       break;
409     }
410     default:
411       llvm_unreachable("Unknown loc info!");
412     }
413 
414     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
415 
416     // Guarantee that all emitted copies are stuck together with flags.
417     Flag = Chain.getValue(1);
418     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
419   }
420 
421   RetOps[0] = Chain; // Update chain.
422 
423   // Add the flag if we have it.
424   if (Flag.getNode())
425     RetOps.push_back(Flag);
426 
427   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
428 }
429 
430 SDValue VETargetLowering::LowerFormalArguments(
431     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
432     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
433     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
434   MachineFunction &MF = DAG.getMachineFunction();
435 
436   // Get the base offset of the incoming arguments stack space.
437   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
438   // Get the size of the preserved arguments area
439   unsigned ArgsPreserved = 64;
440 
441   // Analyze arguments according to CC_VE.
442   SmallVector<CCValAssign, 16> ArgLocs;
443   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
444                  *DAG.getContext());
445   // Allocate the preserved area first.
446   CCInfo.AllocateStack(ArgsPreserved, Align(8));
447   // We already allocated the preserved area, so the stack offset computed
448   // by CC_VE would be correct now.
449   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
450 
451   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
452     CCValAssign &VA = ArgLocs[i];
453     assert(!VA.needsCustom() && "Unexpected custom lowering");
454     if (VA.isRegLoc()) {
455       // This argument is passed in a register.
456       // All integer register arguments are promoted by the caller to i64.
457 
458       // Create a virtual register for the promoted live-in value.
459       Register VReg =
460           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
461       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
462 
463       // The caller promoted the argument, so insert an Assert?ext SDNode so we
464       // won't promote the value again in this function.
465       switch (VA.getLocInfo()) {
466       case CCValAssign::SExt:
467         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
468                           DAG.getValueType(VA.getValVT()));
469         break;
470       case CCValAssign::ZExt:
471         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
472                           DAG.getValueType(VA.getValVT()));
473         break;
474       case CCValAssign::BCvt: {
475         // Extract a float argument from i64 with padding.
476         //     63     31   0
477         //    +------+------+
478         //    | float|   0  |
479         //    +------+------+
480         assert(VA.getLocVT() == MVT::i64);
481         assert(VA.getValVT() == MVT::f32);
482         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
483         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
484                                          MVT::f32, Arg, Sub_f32),
485                       0);
486         break;
487       }
488       default:
489         break;
490       }
491 
492       // Truncate the register down to the argument type.
493       if (VA.isExtInLoc())
494         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
495 
496       InVals.push_back(Arg);
497       continue;
498     }
499 
500     // The registers are exhausted. This argument was passed on the stack.
501     assert(VA.isMemLoc());
502     // The CC_VE_Full/Half functions compute stack offsets relative to the
503     // beginning of the arguments area at %fp + the size of reserved area.
504     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
505     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
506 
507     // Adjust offset for a float argument by adding 4 since the argument is
508     // stored in 8 bytes buffer with offset like below.  LLVM generates
509     // 4 bytes load instruction, so need to adjust offset here.  This
510     // adjustment is required in only LowerFormalArguments.  In LowerCall,
511     // a float argument is converted to i64 first, and stored as 8 bytes
512     // data, which is required by ABI, so no need for adjustment.
513     //    0      4
514     //    +------+------+
515     //    | empty| float|
516     //    +------+------+
517     if (VA.getValVT() == MVT::f32)
518       Offset += 4;
519 
520     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
521     InVals.push_back(
522         DAG.getLoad(VA.getValVT(), DL, Chain,
523                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
524                     MachinePointerInfo::getFixedStack(MF, FI)));
525   }
526 
527   if (!IsVarArg)
528     return Chain;
529 
530   // This function takes variable arguments, some of which may have been passed
531   // in registers %s0-%s8.
532   //
533   // The va_start intrinsic needs to know the offset to the first variable
534   // argument.
535   // TODO: need to calculate offset correctly once we support f128.
536   unsigned ArgOffset = ArgLocs.size() * 8;
537   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
538   // Skip the reserved area at the top of stack.
539   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
540 
541   return Chain;
542 }
543 
544 // FIXME? Maybe this could be a TableGen attribute on some registers and
545 // this table could be generated automatically from RegInfo.
546 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
547                                              const MachineFunction &MF) const {
548   Register Reg = StringSwitch<Register>(RegName)
549                      .Case("sp", VE::SX11)    // Stack pointer
550                      .Case("fp", VE::SX9)     // Frame pointer
551                      .Case("sl", VE::SX8)     // Stack limit
552                      .Case("lr", VE::SX10)    // Link register
553                      .Case("tp", VE::SX14)    // Thread pointer
554                      .Case("outer", VE::SX12) // Outer regiser
555                      .Case("info", VE::SX17)  // Info area register
556                      .Case("got", VE::SX15)   // Global offset table register
557                      .Case("plt", VE::SX16) // Procedure linkage table register
558                      .Default(0);
559 
560   if (Reg)
561     return Reg;
562 
563   report_fatal_error("Invalid register name global variable");
564 }
565 
566 //===----------------------------------------------------------------------===//
567 // TargetLowering Implementation
568 //===----------------------------------------------------------------------===//
569 
570 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
571                                     SmallVectorImpl<SDValue> &InVals) const {
572   SelectionDAG &DAG = CLI.DAG;
573   SDLoc DL = CLI.DL;
574   SDValue Chain = CLI.Chain;
575   auto PtrVT = getPointerTy(DAG.getDataLayout());
576 
577   // VE target does not yet support tail call optimization.
578   CLI.IsTailCall = false;
579 
580   // Get the base offset of the outgoing arguments stack space.
581   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
582   // Get the size of the preserved arguments area
583   unsigned ArgsPreserved = 8 * 8u;
584 
585   // Analyze operands of the call, assigning locations to each operand.
586   SmallVector<CCValAssign, 16> ArgLocs;
587   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
588                  *DAG.getContext());
589   // Allocate the preserved area first.
590   CCInfo.AllocateStack(ArgsPreserved, Align(8));
591   // We already allocated the preserved area, so the stack offset computed
592   // by CC_VE would be correct now.
593   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
594 
595   // VE requires to use both register and stack for varargs or no-prototyped
596   // functions.
597   bool UseBoth = CLI.IsVarArg;
598 
599   // Analyze operands again if it is required to store BOTH.
600   SmallVector<CCValAssign, 16> ArgLocs2;
601   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
602                   ArgLocs2, *DAG.getContext());
603   if (UseBoth)
604     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
605 
606   // Get the size of the outgoing arguments stack space requirement.
607   unsigned ArgsSize = CCInfo.getNextStackOffset();
608 
609   // Keep stack frames 16-byte aligned.
610   ArgsSize = alignTo(ArgsSize, 16);
611 
612   // Adjust the stack pointer to make room for the arguments.
613   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
614   // with more than 6 arguments.
615   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
616 
617   // Collect the set of registers to pass to the function and their values.
618   // This will be emitted as a sequence of CopyToReg nodes glued to the call
619   // instruction.
620   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
621 
622   // Collect chains from all the memory opeations that copy arguments to the
623   // stack. They must follow the stack pointer adjustment above and precede the
624   // call instruction itself.
625   SmallVector<SDValue, 8> MemOpChains;
626 
627   // VE needs to get address of callee function in a register
628   // So, prepare to copy it to SX12 here.
629 
630   // If the callee is a GlobalAddress node (quite common, every direct call is)
631   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
632   // Likewise ExternalSymbol -> TargetExternalSymbol.
633   SDValue Callee = CLI.Callee;
634 
635   bool IsPICCall = isPositionIndependent();
636 
637   // PC-relative references to external symbols should go through $stub.
638   // If so, we need to prepare GlobalBaseReg first.
639   const TargetMachine &TM = DAG.getTarget();
640   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
641   const GlobalValue *GV = nullptr;
642   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
643   if (CalleeG)
644     GV = CalleeG->getGlobal();
645   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
646   bool UsePlt = !Local;
647   MachineFunction &MF = DAG.getMachineFunction();
648 
649   // Turn GlobalAddress/ExternalSymbol node into a value node
650   // containing the address of them here.
651   if (CalleeG) {
652     if (IsPICCall) {
653       if (UsePlt)
654         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
655       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
656       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
657     } else {
658       Callee =
659           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
660     }
661   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
662     if (IsPICCall) {
663       if (UsePlt)
664         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
665       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
666       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
667     } else {
668       Callee =
669           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
670     }
671   }
672 
673   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
674 
675   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
676     CCValAssign &VA = ArgLocs[i];
677     SDValue Arg = CLI.OutVals[i];
678 
679     // Promote the value if needed.
680     switch (VA.getLocInfo()) {
681     default:
682       llvm_unreachable("Unknown location info!");
683     case CCValAssign::Full:
684       break;
685     case CCValAssign::SExt:
686       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
687       break;
688     case CCValAssign::ZExt:
689       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
690       break;
691     case CCValAssign::AExt:
692       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
693       break;
694     case CCValAssign::BCvt: {
695       // Convert a float argument to i64 with padding.
696       //     63     31   0
697       //    +------+------+
698       //    | float|   0  |
699       //    +------+------+
700       assert(VA.getLocVT() == MVT::i64);
701       assert(VA.getValVT() == MVT::f32);
702       SDValue Undef = SDValue(
703           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
704       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
705       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
706                                        MVT::i64, Undef, Arg, Sub_f32),
707                     0);
708       break;
709     }
710     }
711 
712     if (VA.isRegLoc()) {
713       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
714       if (!UseBoth)
715         continue;
716       VA = ArgLocs2[i];
717     }
718 
719     assert(VA.isMemLoc());
720 
721     // Create a store off the stack pointer for this argument.
722     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
723     // The argument area starts at %fp/%sp + the size of reserved area.
724     SDValue PtrOff =
725         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
726     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
727     MemOpChains.push_back(
728         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
729   }
730 
731   // Emit all stores, make sure they occur before the call.
732   if (!MemOpChains.empty())
733     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
734 
735   // Build a sequence of CopyToReg nodes glued together with token chain and
736   // glue operands which copy the outgoing args into registers. The InGlue is
737   // necessary since all emitted instructions must be stuck together in order
738   // to pass the live physical registers.
739   SDValue InGlue;
740   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
741     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
742                              RegsToPass[i].second, InGlue);
743     InGlue = Chain.getValue(1);
744   }
745 
746   // Build the operands for the call instruction itself.
747   SmallVector<SDValue, 8> Ops;
748   Ops.push_back(Chain);
749   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
750     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
751                                   RegsToPass[i].second.getValueType()));
752 
753   // Add a register mask operand representing the call-preserved registers.
754   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
755   const uint32_t *Mask =
756       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
757   assert(Mask && "Missing call preserved mask for calling convention");
758   Ops.push_back(DAG.getRegisterMask(Mask));
759 
760   // Make sure the CopyToReg nodes are glued to the call instruction which
761   // consumes the registers.
762   if (InGlue.getNode())
763     Ops.push_back(InGlue);
764 
765   // Now the call itself.
766   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
767   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
768   InGlue = Chain.getValue(1);
769 
770   // Revert the stack pointer immediately after the call.
771   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
772                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
773   InGlue = Chain.getValue(1);
774 
775   // Now extract the return values. This is more or less the same as
776   // LowerFormalArguments.
777 
778   // Assign locations to each value returned by this call.
779   SmallVector<CCValAssign, 16> RVLocs;
780   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
781                  *DAG.getContext());
782 
783   // Set inreg flag manually for codegen generated library calls that
784   // return float.
785   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
786     CLI.Ins[0].Flags.setInReg();
787 
788   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
789 
790   // Copy all of the result registers out of their specified physreg.
791   for (unsigned i = 0; i != RVLocs.size(); ++i) {
792     CCValAssign &VA = RVLocs[i];
793     assert(!VA.needsCustom() && "Unexpected custom lowering");
794     Register Reg = VA.getLocReg();
795 
796     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
797     // reside in the same register in the high and low bits. Reuse the
798     // CopyFromReg previous node to avoid duplicate copies.
799     SDValue RV;
800     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
801       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
802         RV = Chain.getValue(0);
803 
804     // But usually we'll create a new CopyFromReg for a different register.
805     if (!RV.getNode()) {
806       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
807       Chain = RV.getValue(1);
808       InGlue = Chain.getValue(2);
809     }
810 
811     // The callee promoted the return value, so insert an Assert?ext SDNode so
812     // we won't promote the value again in this function.
813     switch (VA.getLocInfo()) {
814     case CCValAssign::SExt:
815       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
816                        DAG.getValueType(VA.getValVT()));
817       break;
818     case CCValAssign::ZExt:
819       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
820                        DAG.getValueType(VA.getValVT()));
821       break;
822     case CCValAssign::BCvt: {
823       // Extract a float return value from i64 with padding.
824       //     63     31   0
825       //    +------+------+
826       //    | float|   0  |
827       //    +------+------+
828       assert(VA.getLocVT() == MVT::i64);
829       assert(VA.getValVT() == MVT::f32);
830       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
831       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
832                                       MVT::f32, RV, Sub_f32),
833                    0);
834       break;
835     }
836     default:
837       break;
838     }
839 
840     // Truncate the register down to the return value type.
841     if (VA.isExtInLoc())
842       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
843 
844     InVals.push_back(RV);
845   }
846 
847   return Chain;
848 }
849 
850 bool VETargetLowering::isOffsetFoldingLegal(
851     const GlobalAddressSDNode *GA) const {
852   // VE uses 64 bit addressing, so we need multiple instructions to generate
853   // an address.  Folding address with offset increases the number of
854   // instructions, so that we disable it here.  Offsets will be folded in
855   // the DAG combine later if it worth to do so.
856   return false;
857 }
858 
859 /// isFPImmLegal - Returns true if the target can instruction select the
860 /// specified FP immediate natively. If false, the legalizer will
861 /// materialize the FP immediate as a load from a constant pool.
862 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
863                                     bool ForCodeSize) const {
864   return VT == MVT::f32 || VT == MVT::f64;
865 }
866 
867 /// Determine if the target supports unaligned memory accesses.
868 ///
869 /// This function returns true if the target allows unaligned memory accesses
870 /// of the specified type in the given address space. If true, it also returns
871 /// whether the unaligned memory access is "fast" in the last argument by
872 /// reference. This is used, for example, in situations where an array
873 /// copy/move/set is converted to a sequence of store operations. Its use
874 /// helps to ensure that such replacements don't generate code that causes an
875 /// alignment error (trap) on the target machine.
876 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
877                                                       unsigned AddrSpace,
878                                                       Align A,
879                                                       MachineMemOperand::Flags,
880                                                       bool *Fast) const {
881   if (Fast) {
882     // It's fast anytime on VE
883     *Fast = true;
884   }
885   return true;
886 }
887 
888 VETargetLowering::VETargetLowering(const TargetMachine &TM,
889                                    const VESubtarget &STI)
890     : TargetLowering(TM), Subtarget(&STI) {
891   // Instructions which use registers as conditionals examine all the
892   // bits (as does the pseudo SELECT_CC expansion). I don't think it
893   // matters much whether it's ZeroOrOneBooleanContent, or
894   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
895   // former.
896   setBooleanContents(ZeroOrOneBooleanContent);
897   setBooleanVectorContents(ZeroOrOneBooleanContent);
898 
899   initRegisterClasses();
900   initSPUActions();
901   initVPUActions();
902 
903   setStackPointerRegisterToSaveRestore(VE::SX11);
904 
905   // We have target-specific dag combine patterns for the following nodes:
906   setTargetDAGCombine(ISD::TRUNCATE);
907 
908   // Set function alignment to 16 bytes
909   setMinFunctionAlignment(Align(16));
910 
911   // VE stores all argument by 8 bytes alignment
912   setMinStackArgumentAlignment(Align(8));
913 
914   computeRegisterProperties(Subtarget->getRegisterInfo());
915 }
916 
917 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
918 #define TARGET_NODE_CASE(NAME)                                                 \
919   case VEISD::NAME:                                                            \
920     return "VEISD::" #NAME;
921   switch ((VEISD::NodeType)Opcode) {
922   case VEISD::FIRST_NUMBER:
923     break;
924     TARGET_NODE_CASE(CALL)
925     TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
926     TARGET_NODE_CASE(EH_SJLJ_SETJMP)
927     TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
928     TARGET_NODE_CASE(GETFUNPLT)
929     TARGET_NODE_CASE(GETSTACKTOP)
930     TARGET_NODE_CASE(GETTLSADDR)
931     TARGET_NODE_CASE(GLOBAL_BASE_REG)
932     TARGET_NODE_CASE(Hi)
933     TARGET_NODE_CASE(Lo)
934     TARGET_NODE_CASE(MEMBARRIER)
935     TARGET_NODE_CASE(RET_FLAG)
936     TARGET_NODE_CASE(TS1AM)
937     TARGET_NODE_CASE(VEC_UNPACK_LO)
938     TARGET_NODE_CASE(VEC_UNPACK_HI)
939     TARGET_NODE_CASE(VEC_PACK)
940     TARGET_NODE_CASE(VEC_BROADCAST)
941     TARGET_NODE_CASE(REPL_I32)
942     TARGET_NODE_CASE(REPL_F32)
943 
944     TARGET_NODE_CASE(LEGALAVL)
945 
946     // Register the VVP_* SDNodes.
947 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
948 #include "VVPNodes.def"
949   }
950 #undef TARGET_NODE_CASE
951   return nullptr;
952 }
953 
954 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
955                                          EVT VT) const {
956   return MVT::i32;
957 }
958 
959 // Convert to a target node and set target flags.
960 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
961                                           SelectionDAG &DAG) const {
962   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
963     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
964                                       GA->getValueType(0), GA->getOffset(), TF);
965 
966   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
967     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
968                                      0, TF);
969 
970   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
971     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
972                                      CP->getAlign(), CP->getOffset(), TF);
973 
974   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
975     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
976                                        TF);
977 
978   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
979     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
980 
981   llvm_unreachable("Unhandled address SDNode");
982 }
983 
984 // Split Op into high and low parts according to HiTF and LoTF.
985 // Return an ADD node combining the parts.
986 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
987                                        SelectionDAG &DAG) const {
988   SDLoc DL(Op);
989   EVT VT = Op.getValueType();
990   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
991   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
992   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
993 }
994 
995 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
996 // or ExternalSymbol SDNode.
997 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
998   SDLoc DL(Op);
999   EVT PtrVT = Op.getValueType();
1000 
1001   // Handle PIC mode first. VE needs a got load for every variable!
1002   if (isPositionIndependent()) {
1003     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
1004 
1005     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
1006         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
1007       // Create following instructions for local linkage PIC code.
1008       //     lea %reg, label@gotoff_lo
1009       //     and %reg, %reg, (32)0
1010       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
1011       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
1012                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1013       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1014       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1015     }
1016     // Create following instructions for not local linkage PIC code.
1017     //     lea %reg, label@got_lo
1018     //     and %reg, %reg, (32)0
1019     //     lea.sl %reg, label@got_hi(%reg)
1020     //     ld %reg, (%reg, %got)
1021     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
1022                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
1023     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1024     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1025     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
1026                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
1027   }
1028 
1029   // This is one of the absolute code models.
1030   switch (getTargetMachine().getCodeModel()) {
1031   default:
1032     llvm_unreachable("Unsupported absolute code model");
1033   case CodeModel::Small:
1034   case CodeModel::Medium:
1035   case CodeModel::Large:
1036     // abs64.
1037     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1038   }
1039 }
1040 
1041 /// Custom Lower {
1042 
1043 // The mappings for emitLeading/TrailingFence for VE is designed by following
1044 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
1045 Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
1046                                                 Instruction *Inst,
1047                                                 AtomicOrdering Ord) const {
1048   switch (Ord) {
1049   case AtomicOrdering::NotAtomic:
1050   case AtomicOrdering::Unordered:
1051     llvm_unreachable("Invalid fence: unordered/non-atomic");
1052   case AtomicOrdering::Monotonic:
1053   case AtomicOrdering::Acquire:
1054     return nullptr; // Nothing to do
1055   case AtomicOrdering::Release:
1056   case AtomicOrdering::AcquireRelease:
1057     return Builder.CreateFence(AtomicOrdering::Release);
1058   case AtomicOrdering::SequentiallyConsistent:
1059     if (!Inst->hasAtomicStore())
1060       return nullptr; // Nothing to do
1061     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1062   }
1063   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
1064 }
1065 
1066 Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
1067                                                  Instruction *Inst,
1068                                                  AtomicOrdering Ord) const {
1069   switch (Ord) {
1070   case AtomicOrdering::NotAtomic:
1071   case AtomicOrdering::Unordered:
1072     llvm_unreachable("Invalid fence: unordered/not-atomic");
1073   case AtomicOrdering::Monotonic:
1074   case AtomicOrdering::Release:
1075     return nullptr; // Nothing to do
1076   case AtomicOrdering::Acquire:
1077   case AtomicOrdering::AcquireRelease:
1078     return Builder.CreateFence(AtomicOrdering::Acquire);
1079   case AtomicOrdering::SequentiallyConsistent:
1080     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1081   }
1082   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
1083 }
1084 
1085 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
1086                                             SelectionDAG &DAG) const {
1087   SDLoc DL(Op);
1088   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
1089       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
1090   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
1091       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
1092 
1093   // VE uses Release consistency, so need a fence instruction if it is a
1094   // cross-thread fence.
1095   if (FenceSSID == SyncScope::System) {
1096     switch (FenceOrdering) {
1097     case AtomicOrdering::NotAtomic:
1098     case AtomicOrdering::Unordered:
1099     case AtomicOrdering::Monotonic:
1100       // No need to generate fencem instruction here.
1101       break;
1102     case AtomicOrdering::Acquire:
1103       // Generate "fencem 2" as acquire fence.
1104       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1105                                         DAG.getTargetConstant(2, DL, MVT::i32),
1106                                         Op.getOperand(0)),
1107                      0);
1108     case AtomicOrdering::Release:
1109       // Generate "fencem 1" as release fence.
1110       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1111                                         DAG.getTargetConstant(1, DL, MVT::i32),
1112                                         Op.getOperand(0)),
1113                      0);
1114     case AtomicOrdering::AcquireRelease:
1115     case AtomicOrdering::SequentiallyConsistent:
1116       // Generate "fencem 3" as acq_rel and seq_cst fence.
1117       // FIXME: "fencem 3" doesn't wait for for PCIe deveices accesses,
1118       //        so  seq_cst may require more instruction for them.
1119       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1120                                         DAG.getTargetConstant(3, DL, MVT::i32),
1121                                         Op.getOperand(0)),
1122                      0);
1123     }
1124   }
1125 
1126   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
1127   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
1128 }
1129 
1130 TargetLowering::AtomicExpansionKind
1131 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
1132   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
1133   if (AI->getOperation() == AtomicRMWInst::Xchg) {
1134     return AtomicExpansionKind::None;
1135   }
1136   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
1137 
1138   // Otherwise, expand it using compare and exchange instruction to not call
1139   // __sync_fetch_and_* functions.
1140   return AtomicExpansionKind::CmpXChg;
1141 }
1142 
1143 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
1144                             SDValue &Bits) {
1145   SDLoc DL(Op);
1146   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1147   SDValue Ptr = N->getOperand(1);
1148   SDValue Val = N->getOperand(2);
1149   EVT PtrVT = Ptr.getValueType();
1150   bool Byte = N->getMemoryVT() == MVT::i8;
1151   //   Remainder = AND Ptr, 3
1152   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
1153   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
1154   //   Bits = Remainder << 3
1155   //   NewVal = Val << Bits
1156   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
1157   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
1158   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
1159                       : DAG.getConstant(3, DL, MVT::i32);
1160   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
1161   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
1162   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
1163 }
1164 
1165 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
1166                              SDValue Bits) {
1167   SDLoc DL(Op);
1168   EVT VT = Data.getValueType();
1169   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
1170   //   NewData = Data >> Bits
1171   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
1172   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
1173 
1174   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
1175   return DAG.getNode(ISD::AND, DL, VT,
1176                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
1177 }
1178 
1179 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
1180                                            SelectionDAG &DAG) const {
1181   SDLoc DL(Op);
1182   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1183 
1184   if (N->getMemoryVT() == MVT::i8) {
1185     // For i8, use "ts1am"
1186     //   Input:
1187     //     ATOMIC_SWAP Ptr, Val, Order
1188     //
1189     //   Output:
1190     //     Remainder = AND Ptr, 3
1191     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
1192     //     Bits = Remainder << 3
1193     //     NewVal = Val << Bits
1194     //
1195     //     Aligned = AND Ptr, -4
1196     //     Data = TS1AM Aligned, Flag, NewVal
1197     //
1198     //     NewData = Data >> Bits
1199     //     Result = NewData & 0xff ; 1 byte result
1200     SDValue Flag;
1201     SDValue Bits;
1202     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1203 
1204     SDValue Ptr = N->getOperand(1);
1205     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1206                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1207     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1208                                   DAG.getVTList(Op.getNode()->getValueType(0),
1209                                                 Op.getNode()->getValueType(1)),
1210                                   {N->getChain(), Aligned, Flag, NewVal},
1211                                   N->getMemOperand());
1212 
1213     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1214     SDValue Chain = TS1AM.getValue(1);
1215     return DAG.getMergeValues({Result, Chain}, DL);
1216   }
1217   if (N->getMemoryVT() == MVT::i16) {
1218     // For i16, use "ts1am"
1219     SDValue Flag;
1220     SDValue Bits;
1221     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1222 
1223     SDValue Ptr = N->getOperand(1);
1224     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1225                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1226     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1227                                   DAG.getVTList(Op.getNode()->getValueType(0),
1228                                                 Op.getNode()->getValueType(1)),
1229                                   {N->getChain(), Aligned, Flag, NewVal},
1230                                   N->getMemOperand());
1231 
1232     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1233     SDValue Chain = TS1AM.getValue(1);
1234     return DAG.getMergeValues({Result, Chain}, DL);
1235   }
1236   // Otherwise, let llvm legalize it.
1237   return Op;
1238 }
1239 
1240 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
1241                                              SelectionDAG &DAG) const {
1242   return makeAddress(Op, DAG);
1243 }
1244 
1245 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
1246                                             SelectionDAG &DAG) const {
1247   return makeAddress(Op, DAG);
1248 }
1249 
1250 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
1251                                             SelectionDAG &DAG) const {
1252   return makeAddress(Op, DAG);
1253 }
1254 
1255 SDValue
1256 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
1257                                                 SelectionDAG &DAG) const {
1258   SDLoc DL(Op);
1259 
1260   // Generate the following code:
1261   //   t1: ch,glue = callseq_start t0, 0, 0
1262   //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
1263   //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
1264   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
1265   SDValue Label = withTargetFlags(Op, 0, DAG);
1266   EVT PtrVT = Op.getValueType();
1267 
1268   // Lowering the machine isd will make sure everything is in the right
1269   // location.
1270   SDValue Chain = DAG.getEntryNode();
1271   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1272   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
1273       DAG.getMachineFunction(), CallingConv::C);
1274   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
1275   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
1276   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
1277   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
1278                              DAG.getIntPtrConstant(0, DL, true),
1279                              Chain.getValue(1), DL);
1280   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
1281 
1282   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
1283   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1284   MFI.setHasCalls(true);
1285 
1286   // Also generate code to prepare a GOT register if it is PIC.
1287   if (isPositionIndependent()) {
1288     MachineFunction &MF = DAG.getMachineFunction();
1289     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
1290   }
1291 
1292   return Chain;
1293 }
1294 
1295 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
1296                                                 SelectionDAG &DAG) const {
1297   // The current implementation of nld (2.26) doesn't allow local exec model
1298   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
1299   // generate the general dynamic model code sequence.
1300   //
1301   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
1302   return lowerToTLSGeneralDynamicModel(Op, DAG);
1303 }
1304 
1305 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
1306   return makeAddress(Op, DAG);
1307 }
1308 
1309 // Lower a f128 load into two f64 loads.
1310 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
1311   SDLoc DL(Op);
1312   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1313   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1314   unsigned Alignment = LdNode->getAlign().value();
1315   if (Alignment > 8)
1316     Alignment = 8;
1317 
1318   SDValue Lo64 =
1319       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
1320                   LdNode->getPointerInfo(), Alignment,
1321                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1322                                        : MachineMemOperand::MONone);
1323   EVT AddrVT = LdNode->getBasePtr().getValueType();
1324   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
1325                               DAG.getConstant(8, DL, AddrVT));
1326   SDValue Hi64 =
1327       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
1328                   LdNode->getPointerInfo(), Alignment,
1329                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1330                                        : MachineMemOperand::MONone);
1331 
1332   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1333   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1334 
1335   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1336   SDNode *InFP128 =
1337       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
1338   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1339                                SDValue(InFP128, 0), Hi64, SubRegEven);
1340   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1341                                SDValue(InFP128, 0), Lo64, SubRegOdd);
1342   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
1343                           SDValue(Hi64.getNode(), 1)};
1344   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1345   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
1346   return DAG.getMergeValues(Ops, DL);
1347 }
1348 
1349 // Lower a vXi1 load into following instructions
1350 //   LDrii %1, (,%addr)
1351 //   LVMxir  %vm, 0, %1
1352 //   LDrii %2, 8(,%addr)
1353 //   LVMxir  %vm, 0, %2
1354 //   ...
1355 static SDValue lowerLoadI1(SDValue Op, SelectionDAG &DAG) {
1356   SDLoc DL(Op);
1357   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1358   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1359 
1360   SDValue BasePtr = LdNode->getBasePtr();
1361   unsigned Alignment = LdNode->getAlign().value();
1362   if (Alignment > 8)
1363     Alignment = 8;
1364 
1365   EVT AddrVT = BasePtr.getValueType();
1366   EVT MemVT = LdNode->getMemoryVT();
1367   if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
1368     SDValue OutChains[4];
1369     SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
1370     for (int i = 0; i < 4; ++i) {
1371       // Generate load dag and prepare chains.
1372       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1373                                  DAG.getConstant(8 * i, DL, AddrVT));
1374       SDValue Val =
1375           DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
1376                       LdNode->getPointerInfo(), Alignment,
1377                       LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1378                                            : MachineMemOperand::MONone);
1379       OutChains[i] = SDValue(Val.getNode(), 1);
1380 
1381       VM = DAG.getMachineNode(VE::LVMir_m, DL, MVT::i64,
1382                               DAG.getTargetConstant(i, DL, MVT::i64), Val,
1383                               SDValue(VM, 0));
1384     }
1385     SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1386     SDValue Ops[2] = {SDValue(VM, 0), OutChain};
1387     return DAG.getMergeValues(Ops, DL);
1388   } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
1389     SDValue OutChains[8];
1390     SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
1391     for (int i = 0; i < 8; ++i) {
1392       // Generate load dag and prepare chains.
1393       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1394                                  DAG.getConstant(8 * i, DL, AddrVT));
1395       SDValue Val =
1396           DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
1397                       LdNode->getPointerInfo(), Alignment,
1398                       LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1399                                            : MachineMemOperand::MONone);
1400       OutChains[i] = SDValue(Val.getNode(), 1);
1401 
1402       VM = DAG.getMachineNode(VE::LVMyir_y, DL, MVT::i64,
1403                               DAG.getTargetConstant(i, DL, MVT::i64), Val,
1404                               SDValue(VM, 0));
1405     }
1406     SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1407     SDValue Ops[2] = {SDValue(VM, 0), OutChain};
1408     return DAG.getMergeValues(Ops, DL);
1409   } else {
1410     // Otherwise, ask llvm to expand it.
1411     return SDValue();
1412   }
1413 }
1414 
1415 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1416   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
1417 
1418   EVT MemVT = LdNode->getMemoryVT();
1419 
1420   // Dispatch to vector isel.
1421   if (MemVT.isVector() && !isMaskType(MemVT))
1422     return lowerToVVP(Op, DAG);
1423 
1424   SDValue BasePtr = LdNode->getBasePtr();
1425   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1426     // Do not expand store instruction with frame index here because of
1427     // dependency problems.  We expand it later in eliminateFrameIndex().
1428     return Op;
1429   }
1430 
1431   if (MemVT == MVT::f128)
1432     return lowerLoadF128(Op, DAG);
1433   if (isMaskType(MemVT))
1434     return lowerLoadI1(Op, DAG);
1435 
1436   return Op;
1437 }
1438 
1439 // Lower a f128 store into two f64 stores.
1440 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
1441   SDLoc DL(Op);
1442   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1443   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1444 
1445   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1446   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1447 
1448   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1449                                     StNode->getValue(), SubRegEven);
1450   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1451                                     StNode->getValue(), SubRegOdd);
1452 
1453   unsigned Alignment = StNode->getAlign().value();
1454   if (Alignment > 8)
1455     Alignment = 8;
1456 
1457   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1458   SDValue OutChains[2];
1459   OutChains[0] =
1460       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
1461                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
1462                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1463                                         : MachineMemOperand::MONone);
1464   EVT AddrVT = StNode->getBasePtr().getValueType();
1465   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
1466                               DAG.getConstant(8, DL, AddrVT));
1467   OutChains[1] =
1468       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
1469                    MachinePointerInfo(), Alignment,
1470                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1471                                         : MachineMemOperand::MONone);
1472   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1473 }
1474 
1475 // Lower a vXi1 store into following instructions
1476 //   SVMi  %1, %vm, 0
1477 //   STrii %1, (,%addr)
1478 //   SVMi  %2, %vm, 1
1479 //   STrii %2, 8(,%addr)
1480 //   ...
1481 static SDValue lowerStoreI1(SDValue Op, SelectionDAG &DAG) {
1482   SDLoc DL(Op);
1483   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1484   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1485 
1486   SDValue BasePtr = StNode->getBasePtr();
1487   unsigned Alignment = StNode->getAlign().value();
1488   if (Alignment > 8)
1489     Alignment = 8;
1490   EVT AddrVT = BasePtr.getValueType();
1491   EVT MemVT = StNode->getMemoryVT();
1492   if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
1493     SDValue OutChains[4];
1494     for (int i = 0; i < 4; ++i) {
1495       SDNode *V =
1496           DAG.getMachineNode(VE::SVMmi, DL, MVT::i64, StNode->getValue(),
1497                              DAG.getTargetConstant(i, DL, MVT::i64));
1498       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1499                                  DAG.getConstant(8 * i, DL, AddrVT));
1500       OutChains[i] =
1501           DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
1502                        MachinePointerInfo(), Alignment,
1503                        StNode->isVolatile() ? MachineMemOperand::MOVolatile
1504                                             : MachineMemOperand::MONone);
1505     }
1506     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1507   } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
1508     SDValue OutChains[8];
1509     for (int i = 0; i < 8; ++i) {
1510       SDNode *V =
1511           DAG.getMachineNode(VE::SVMyi, DL, MVT::i64, StNode->getValue(),
1512                              DAG.getTargetConstant(i, DL, MVT::i64));
1513       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1514                                  DAG.getConstant(8 * i, DL, AddrVT));
1515       OutChains[i] =
1516           DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
1517                        MachinePointerInfo(), Alignment,
1518                        StNode->isVolatile() ? MachineMemOperand::MOVolatile
1519                                             : MachineMemOperand::MONone);
1520     }
1521     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1522   } else {
1523     // Otherwise, ask llvm to expand it.
1524     return SDValue();
1525   }
1526 }
1527 
1528 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1529   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
1530   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1531 
1532   // always expand non-mask vector loads to VVP
1533   EVT MemVT = StNode->getMemoryVT();
1534   if (MemVT.isVector() && !isMaskType(MemVT))
1535     return lowerToVVP(Op, DAG);
1536 
1537   SDValue BasePtr = StNode->getBasePtr();
1538   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1539     // Do not expand store instruction with frame index here because of
1540     // dependency problems.  We expand it later in eliminateFrameIndex().
1541     return Op;
1542   }
1543 
1544   if (MemVT == MVT::f128)
1545     return lowerStoreF128(Op, DAG);
1546   if (isMaskType(MemVT))
1547     return lowerStoreI1(Op, DAG);
1548 
1549   // Otherwise, ask llvm to expand it.
1550   return SDValue();
1551 }
1552 
1553 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
1554   MachineFunction &MF = DAG.getMachineFunction();
1555   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
1556   auto PtrVT = getPointerTy(DAG.getDataLayout());
1557 
1558   // Need frame address to find the address of VarArgsFrameIndex.
1559   MF.getFrameInfo().setFrameAddressIsTaken(true);
1560 
1561   // vastart just stores the address of the VarArgsFrameIndex slot into the
1562   // memory location argument.
1563   SDLoc DL(Op);
1564   SDValue Offset =
1565       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
1566                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
1567   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
1568   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
1569                       MachinePointerInfo(SV));
1570 }
1571 
1572 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
1573   SDNode *Node = Op.getNode();
1574   EVT VT = Node->getValueType(0);
1575   SDValue InChain = Node->getOperand(0);
1576   SDValue VAListPtr = Node->getOperand(1);
1577   EVT PtrVT = VAListPtr.getValueType();
1578   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
1579   SDLoc DL(Node);
1580   SDValue VAList =
1581       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
1582   SDValue Chain = VAList.getValue(1);
1583   SDValue NextPtr;
1584 
1585   if (VT == MVT::f128) {
1586     // VE f128 values must be stored with 16 bytes alignment.  We don't
1587     // know the actual alignment of VAList, so we take alignment of it
1588     // dynamically.
1589     int Align = 16;
1590     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1591                          DAG.getConstant(Align - 1, DL, PtrVT));
1592     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
1593                          DAG.getConstant(-Align, DL, PtrVT));
1594     // Increment the pointer, VAList, by 16 to the next vaarg.
1595     NextPtr =
1596         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
1597   } else if (VT == MVT::f32) {
1598     // float --> need special handling like below.
1599     //    0      4
1600     //    +------+------+
1601     //    | empty| float|
1602     //    +------+------+
1603     // Increment the pointer, VAList, by 8 to the next vaarg.
1604     NextPtr =
1605         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1606     // Then, adjust VAList.
1607     unsigned InternalOffset = 4;
1608     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1609                          DAG.getConstant(InternalOffset, DL, PtrVT));
1610   } else {
1611     // Increment the pointer, VAList, by 8 to the next vaarg.
1612     NextPtr =
1613         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1614   }
1615 
1616   // Store the incremented VAList to the legalized pointer.
1617   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
1618 
1619   // Load the actual argument out of the pointer VAList.
1620   // We can't count on greater alignment than the word size.
1621   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
1622                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
1623 }
1624 
1625 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
1626                                                   SelectionDAG &DAG) const {
1627   // Generate following code.
1628   //   (void)__llvm_grow_stack(size);
1629   //   ret = GETSTACKTOP;        // pseudo instruction
1630   SDLoc DL(Op);
1631 
1632   // Get the inputs.
1633   SDNode *Node = Op.getNode();
1634   SDValue Chain = Op.getOperand(0);
1635   SDValue Size = Op.getOperand(1);
1636   MaybeAlign Alignment(Op.getConstantOperandVal(2));
1637   EVT VT = Node->getValueType(0);
1638 
1639   // Chain the dynamic stack allocation so that it doesn't modify the stack
1640   // pointer when other instructions are using the stack.
1641   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
1642 
1643   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
1644   Align StackAlign = TFI.getStackAlign();
1645   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
1646 
1647   // Prepare arguments
1648   TargetLowering::ArgListTy Args;
1649   TargetLowering::ArgListEntry Entry;
1650   Entry.Node = Size;
1651   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1652   Args.push_back(Entry);
1653   if (NeedsAlign) {
1654     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
1655     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1656     Args.push_back(Entry);
1657   }
1658   Type *RetTy = Type::getVoidTy(*DAG.getContext());
1659 
1660   EVT PtrVT = Op.getValueType();
1661   SDValue Callee;
1662   if (NeedsAlign) {
1663     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
1664   } else {
1665     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
1666   }
1667 
1668   TargetLowering::CallLoweringInfo CLI(DAG);
1669   CLI.setDebugLoc(DL)
1670       .setChain(Chain)
1671       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
1672       .setDiscardResult(true);
1673   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
1674   Chain = pair.second;
1675   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
1676   if (NeedsAlign) {
1677     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
1678                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
1679     Result = DAG.getNode(ISD::AND, DL, VT, Result,
1680                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
1681   }
1682   //  Chain = Result.getValue(1);
1683   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
1684                              DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
1685 
1686   SDValue Ops[2] = {Result, Chain};
1687   return DAG.getMergeValues(Ops, DL);
1688 }
1689 
1690 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
1691                                                SelectionDAG &DAG) const {
1692   SDLoc DL(Op);
1693   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
1694                      Op.getOperand(1));
1695 }
1696 
1697 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
1698                                               SelectionDAG &DAG) const {
1699   SDLoc DL(Op);
1700   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
1701                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
1702                      Op.getOperand(1));
1703 }
1704 
1705 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
1706                                                       SelectionDAG &DAG) const {
1707   SDLoc DL(Op);
1708   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
1709                      Op.getOperand(0));
1710 }
1711 
1712 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
1713                               const VETargetLowering &TLI,
1714                               const VESubtarget *Subtarget) {
1715   SDLoc DL(Op);
1716   MachineFunction &MF = DAG.getMachineFunction();
1717   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
1718 
1719   MachineFrameInfo &MFI = MF.getFrameInfo();
1720   MFI.setFrameAddressIsTaken(true);
1721 
1722   unsigned Depth = Op.getConstantOperandVal(0);
1723   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
1724   Register FrameReg = RegInfo->getFrameRegister(MF);
1725   SDValue FrameAddr =
1726       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
1727   while (Depth--)
1728     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
1729                             FrameAddr, MachinePointerInfo());
1730   return FrameAddr;
1731 }
1732 
1733 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
1734                                const VETargetLowering &TLI,
1735                                const VESubtarget *Subtarget) {
1736   MachineFunction &MF = DAG.getMachineFunction();
1737   MachineFrameInfo &MFI = MF.getFrameInfo();
1738   MFI.setReturnAddressIsTaken(true);
1739 
1740   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
1741     return SDValue();
1742 
1743   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
1744 
1745   SDLoc DL(Op);
1746   EVT VT = Op.getValueType();
1747   SDValue Offset = DAG.getConstant(8, DL, VT);
1748   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1749                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
1750                      MachinePointerInfo());
1751 }
1752 
1753 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
1754                                                   SelectionDAG &DAG) const {
1755   SDLoc DL(Op);
1756   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1757   switch (IntNo) {
1758   default: // Don't custom lower most intrinsics.
1759     return SDValue();
1760   case Intrinsic::eh_sjlj_lsda: {
1761     MachineFunction &MF = DAG.getMachineFunction();
1762     MVT VT = Op.getSimpleValueType();
1763     const VETargetMachine *TM =
1764         static_cast<const VETargetMachine *>(&DAG.getTarget());
1765 
1766     // Create GCC_except_tableXX string.  The real symbol for that will be
1767     // generated in EHStreamer::emitExceptionTable() later.  So, we just
1768     // borrow it's name here.
1769     TM->getStrList()->push_back(std::string(
1770         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
1771     SDValue Addr =
1772         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
1773     if (isPositionIndependent()) {
1774       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
1775                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1776       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
1777       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
1778     }
1779     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1780   }
1781   }
1782 }
1783 
1784 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
1785   if (!isa<BuildVectorSDNode>(N))
1786     return false;
1787   const auto *BVN = cast<BuildVectorSDNode>(N);
1788 
1789   // Find first non-undef insertion.
1790   unsigned Idx;
1791   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
1792     auto ElemV = BVN->getOperand(Idx);
1793     if (!ElemV->isUndef())
1794       break;
1795   }
1796   // Catch the (hypothetical) all-undef case.
1797   if (Idx == BVN->getNumOperands())
1798     return false;
1799   // Remember insertion.
1800   UniqueIdx = Idx++;
1801   // Verify that all other insertions are undef.
1802   for (; Idx < BVN->getNumOperands(); ++Idx) {
1803     auto ElemV = BVN->getOperand(Idx);
1804     if (!ElemV->isUndef())
1805       return false;
1806   }
1807   return true;
1808 }
1809 
1810 static SDValue getSplatValue(SDNode *N) {
1811   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
1812     return BuildVec->getSplatValue();
1813   }
1814   return SDValue();
1815 }
1816 
1817 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
1818                                             SelectionDAG &DAG) const {
1819   VECustomDAG CDAG(DAG, Op);
1820   MVT ResultVT = Op.getSimpleValueType();
1821 
1822   // If there is just one element, expand to INSERT_VECTOR_ELT.
1823   unsigned UniqueIdx;
1824   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
1825     SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
1826     auto ElemV = Op->getOperand(UniqueIdx);
1827     SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
1828     return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
1829   }
1830 
1831   // Else emit a broadcast.
1832   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
1833     unsigned NumEls = ResultVT.getVectorNumElements();
1834     auto AVL = CDAG.getConstant(NumEls, MVT::i32);
1835     return CDAG.getBroadcast(ResultVT, ScalarV, AVL);
1836   }
1837 
1838   // Expand
1839   return SDValue();
1840 }
1841 
1842 TargetLowering::LegalizeAction
1843 VETargetLowering::getCustomOperationAction(SDNode &Op) const {
1844   // Custom legalization on VVP_* and VEC_* opcodes is required to pack-legalize
1845   // these operations (transform nodes such that their AVL parameter refers to
1846   // packs of 64bit, instead of number of elements.
1847 
1848   // Packing opcodes are created with a pack-legal AVL (LEGALAVL). No need to
1849   // re-visit them.
1850   if (isPackingSupportOpcode(Op.getOpcode()))
1851     return Legal;
1852 
1853   // Custom lower to legalize AVL for packed mode.
1854   if (isVVPOrVEC(Op.getOpcode()))
1855     return Custom;
1856   return Legal;
1857 }
1858 
1859 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1860   LLVM_DEBUG(dbgs() << "::LowerOperation"; Op->print(dbgs()););
1861   unsigned Opcode = Op.getOpcode();
1862 
1863   /// Scalar isel.
1864   switch (Opcode) {
1865   case ISD::ATOMIC_FENCE:
1866     return lowerATOMIC_FENCE(Op, DAG);
1867   case ISD::ATOMIC_SWAP:
1868     return lowerATOMIC_SWAP(Op, DAG);
1869   case ISD::BlockAddress:
1870     return lowerBlockAddress(Op, DAG);
1871   case ISD::ConstantPool:
1872     return lowerConstantPool(Op, DAG);
1873   case ISD::DYNAMIC_STACKALLOC:
1874     return lowerDYNAMIC_STACKALLOC(Op, DAG);
1875   case ISD::EH_SJLJ_LONGJMP:
1876     return lowerEH_SJLJ_LONGJMP(Op, DAG);
1877   case ISD::EH_SJLJ_SETJMP:
1878     return lowerEH_SJLJ_SETJMP(Op, DAG);
1879   case ISD::EH_SJLJ_SETUP_DISPATCH:
1880     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
1881   case ISD::FRAMEADDR:
1882     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
1883   case ISD::GlobalAddress:
1884     return lowerGlobalAddress(Op, DAG);
1885   case ISD::GlobalTLSAddress:
1886     return lowerGlobalTLSAddress(Op, DAG);
1887   case ISD::INTRINSIC_WO_CHAIN:
1888     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
1889   case ISD::JumpTable:
1890     return lowerJumpTable(Op, DAG);
1891   case ISD::LOAD:
1892     return lowerLOAD(Op, DAG);
1893   case ISD::RETURNADDR:
1894     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
1895   case ISD::BUILD_VECTOR:
1896     return lowerBUILD_VECTOR(Op, DAG);
1897   case ISD::STORE:
1898     return lowerSTORE(Op, DAG);
1899   case ISD::VASTART:
1900     return lowerVASTART(Op, DAG);
1901   case ISD::VAARG:
1902     return lowerVAARG(Op, DAG);
1903 
1904   case ISD::INSERT_VECTOR_ELT:
1905     return lowerINSERT_VECTOR_ELT(Op, DAG);
1906   case ISD::EXTRACT_VECTOR_ELT:
1907     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
1908   }
1909 
1910   /// Vector isel.
1911   LLVM_DEBUG(dbgs() << "::LowerOperation_VVP"; Op->print(dbgs()););
1912   if (ISD::isVPOpcode(Opcode))
1913     return lowerToVVP(Op, DAG);
1914 
1915   switch (Opcode) {
1916   default:
1917     llvm_unreachable("Should not custom lower this!");
1918 
1919   // Legalize the AVL of this internal node.
1920   case VEISD::VEC_BROADCAST:
1921 #define ADD_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME:
1922 #include "VVPNodes.def"
1923     // AVL already legalized.
1924     if (getAnnotatedNodeAVL(Op).second)
1925       return Op;
1926     return legalizeInternalVectorOp(Op, DAG);
1927 
1928     // Translate into a VEC_*/VVP_* layer operation.
1929   case ISD::MLOAD:
1930   case ISD::MSTORE:
1931 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
1932 #include "VVPNodes.def"
1933     if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType()))
1934       return splitMaskArithmetic(Op, DAG);
1935     return lowerToVVP(Op, DAG);
1936   }
1937 }
1938 /// } Custom Lower
1939 
1940 void VETargetLowering::ReplaceNodeResults(SDNode *N,
1941                                           SmallVectorImpl<SDValue> &Results,
1942                                           SelectionDAG &DAG) const {
1943   switch (N->getOpcode()) {
1944   case ISD::ATOMIC_SWAP:
1945     // Let LLVM expand atomic swap instruction through LowerOperation.
1946     return;
1947   default:
1948     LLVM_DEBUG(N->dumpr(&DAG));
1949     llvm_unreachable("Do not know how to custom type legalize this operation!");
1950   }
1951 }
1952 
1953 /// JumpTable for VE.
1954 ///
1955 ///   VE cannot generate relocatable symbol in jump table.  VE cannot
1956 ///   generate expressions using symbols in both text segment and data
1957 ///   segment like below.
1958 ///             .4byte  .LBB0_2-.LJTI0_0
1959 ///   So, we generate offset from the top of function like below as
1960 ///   a custom label.
1961 ///             .4byte  .LBB0_2-<function name>
1962 
1963 unsigned VETargetLowering::getJumpTableEncoding() const {
1964   // Use custom label for PIC.
1965   if (isPositionIndependent())
1966     return MachineJumpTableInfo::EK_Custom32;
1967 
1968   // Otherwise, use the normal jump table encoding heuristics.
1969   return TargetLowering::getJumpTableEncoding();
1970 }
1971 
1972 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
1973     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
1974     unsigned Uid, MCContext &Ctx) const {
1975   assert(isPositionIndependent());
1976 
1977   // Generate custom label for PIC like below.
1978   //    .4bytes  .LBB0_2-<function name>
1979   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
1980   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
1981   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
1982   return MCBinaryExpr::createSub(Value, Base, Ctx);
1983 }
1984 
1985 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
1986                                                    SelectionDAG &DAG) const {
1987   assert(isPositionIndependent());
1988   SDLoc DL(Table);
1989   Function *Function = &DAG.getMachineFunction().getFunction();
1990   assert(Function != nullptr);
1991   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
1992 
1993   // In the jump table, we have following values in PIC mode.
1994   //    .4bytes  .LBB0_2-<function name>
1995   // We need to add this value and the address of this function to generate
1996   // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
1997   // instructions:
1998   //     lea %reg, fun@gotoff_lo
1999   //     and %reg, %reg, (32)0
2000   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
2001   // In order to do so, we need to genarate correctly marked DAG node using
2002   // makeHiLoPair.
2003   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
2004   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
2005                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
2006   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
2007   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
2008 }
2009 
2010 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
2011                                       MachineBasicBlock::iterator I,
2012                                       MachineBasicBlock *TargetBB,
2013                                       const DebugLoc &DL) const {
2014   MachineFunction *MF = MBB.getParent();
2015   MachineRegisterInfo &MRI = MF->getRegInfo();
2016   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2017 
2018   const TargetRegisterClass *RC = &VE::I64RegClass;
2019   Register Tmp1 = MRI.createVirtualRegister(RC);
2020   Register Tmp2 = MRI.createVirtualRegister(RC);
2021   Register Result = MRI.createVirtualRegister(RC);
2022 
2023   if (isPositionIndependent()) {
2024     // Create following instructions for local linkage PIC code.
2025     //     lea %Tmp1, TargetBB@gotoff_lo
2026     //     and %Tmp2, %Tmp1, (32)0
2027     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2028     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2029         .addImm(0)
2030         .addImm(0)
2031         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
2032     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2033         .addReg(Tmp1, getKillRegState(true))
2034         .addImm(M0(32));
2035     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
2036         .addReg(VE::SX15)
2037         .addReg(Tmp2, getKillRegState(true))
2038         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
2039   } else {
2040     // Create following instructions for non-PIC code.
2041     //     lea     %Tmp1, TargetBB@lo
2042     //     and     %Tmp2, %Tmp1, (32)0
2043     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
2044     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2045         .addImm(0)
2046         .addImm(0)
2047         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
2048     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2049         .addReg(Tmp1, getKillRegState(true))
2050         .addImm(M0(32));
2051     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
2052         .addReg(Tmp2, getKillRegState(true))
2053         .addImm(0)
2054         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
2055   }
2056   return Result;
2057 }
2058 
2059 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
2060                                          MachineBasicBlock::iterator I,
2061                                          StringRef Symbol, const DebugLoc &DL,
2062                                          bool IsLocal = false,
2063                                          bool IsCall = false) const {
2064   MachineFunction *MF = MBB.getParent();
2065   MachineRegisterInfo &MRI = MF->getRegInfo();
2066   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2067 
2068   const TargetRegisterClass *RC = &VE::I64RegClass;
2069   Register Result = MRI.createVirtualRegister(RC);
2070 
2071   if (isPositionIndependent()) {
2072     if (IsCall && !IsLocal) {
2073       // Create following instructions for non-local linkage PIC code function
2074       // calls.  These instructions uses IC and magic number -24, so we expand
2075       // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
2076       //     lea %Reg, Symbol@plt_lo(-24)
2077       //     and %Reg, %Reg, (32)0
2078       //     sic %s16
2079       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
2080       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
2081           .addExternalSymbol("abort");
2082     } else if (IsLocal) {
2083       Register Tmp1 = MRI.createVirtualRegister(RC);
2084       Register Tmp2 = MRI.createVirtualRegister(RC);
2085       // Create following instructions for local linkage PIC code.
2086       //     lea %Tmp1, Symbol@gotoff_lo
2087       //     and %Tmp2, %Tmp1, (32)0
2088       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2089       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2090           .addImm(0)
2091           .addImm(0)
2092           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
2093       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2094           .addReg(Tmp1, getKillRegState(true))
2095           .addImm(M0(32));
2096       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
2097           .addReg(VE::SX15)
2098           .addReg(Tmp2, getKillRegState(true))
2099           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
2100     } else {
2101       Register Tmp1 = MRI.createVirtualRegister(RC);
2102       Register Tmp2 = MRI.createVirtualRegister(RC);
2103       // Create following instructions for not local linkage PIC code.
2104       //     lea %Tmp1, Symbol@got_lo
2105       //     and %Tmp2, %Tmp1, (32)0
2106       //     lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2107       //     ld %Result, 0(%Tmp3)
2108       Register Tmp3 = MRI.createVirtualRegister(RC);
2109       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2110           .addImm(0)
2111           .addImm(0)
2112           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
2113       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2114           .addReg(Tmp1, getKillRegState(true))
2115           .addImm(M0(32));
2116       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
2117           .addReg(VE::SX15)
2118           .addReg(Tmp2, getKillRegState(true))
2119           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
2120       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
2121           .addReg(Tmp3, getKillRegState(true))
2122           .addImm(0)
2123           .addImm(0);
2124     }
2125   } else {
2126     Register Tmp1 = MRI.createVirtualRegister(RC);
2127     Register Tmp2 = MRI.createVirtualRegister(RC);
2128     // Create following instructions for non-PIC code.
2129     //     lea     %Tmp1, Symbol@lo
2130     //     and     %Tmp2, %Tmp1, (32)0
2131     //     lea.sl  %Result, Symbol@hi(%Tmp2)
2132     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2133         .addImm(0)
2134         .addImm(0)
2135         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
2136     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2137         .addReg(Tmp1, getKillRegState(true))
2138         .addImm(M0(32));
2139     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
2140         .addReg(Tmp2, getKillRegState(true))
2141         .addImm(0)
2142         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
2143   }
2144   return Result;
2145 }
2146 
2147 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
2148                                               MachineBasicBlock *MBB,
2149                                               MachineBasicBlock *DispatchBB,
2150                                               int FI, int Offset) const {
2151   DebugLoc DL = MI.getDebugLoc();
2152   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2153 
2154   Register LabelReg =
2155       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
2156 
2157   // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
2158   // referenced by longjmp (throw) later.
2159   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2160   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
2161   MIB.addReg(LabelReg, getKillRegState(true));
2162 }
2163 
2164 MachineBasicBlock *
2165 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
2166                                    MachineBasicBlock *MBB) const {
2167   DebugLoc DL = MI.getDebugLoc();
2168   MachineFunction *MF = MBB->getParent();
2169   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2170   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
2171   MachineRegisterInfo &MRI = MF->getRegInfo();
2172 
2173   const BasicBlock *BB = MBB->getBasicBlock();
2174   MachineFunction::iterator I = ++MBB->getIterator();
2175 
2176   // Memory Reference.
2177   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2178                                            MI.memoperands_end());
2179   Register BufReg = MI.getOperand(1).getReg();
2180 
2181   Register DstReg;
2182 
2183   DstReg = MI.getOperand(0).getReg();
2184   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
2185   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
2186   (void)TRI;
2187   Register MainDestReg = MRI.createVirtualRegister(RC);
2188   Register RestoreDestReg = MRI.createVirtualRegister(RC);
2189 
2190   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
2191   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
2192   //
2193   // ThisMBB:
2194   //   buf[3] = %s17 iff %s17 is used as BP
2195   //   buf[1] = RestoreMBB as IC after longjmp
2196   //   # SjLjSetup RestoreMBB
2197   //
2198   // MainMBB:
2199   //   v_main = 0
2200   //
2201   // SinkMBB:
2202   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
2203   //   ...
2204   //
2205   // RestoreMBB:
2206   //   %s17 = buf[3] = iff %s17 is used as BP
2207   //   v_restore = 1
2208   //   goto SinkMBB
2209 
2210   MachineBasicBlock *ThisMBB = MBB;
2211   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
2212   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
2213   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
2214   MF->insert(I, MainMBB);
2215   MF->insert(I, SinkMBB);
2216   MF->push_back(RestoreMBB);
2217   RestoreMBB->setHasAddressTaken();
2218 
2219   // Transfer the remainder of BB and its successor edges to SinkMBB.
2220   SinkMBB->splice(SinkMBB->begin(), MBB,
2221                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2222   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
2223 
2224   // ThisMBB:
2225   Register LabelReg =
2226       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
2227 
2228   // Store BP in buf[3] iff this function is using BP.
2229   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
2230   if (TFI->hasBP(*MF)) {
2231     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2232     MIB.addReg(BufReg);
2233     MIB.addImm(0);
2234     MIB.addImm(24);
2235     MIB.addReg(VE::SX17);
2236     MIB.setMemRefs(MMOs);
2237   }
2238 
2239   // Store IP in buf[1].
2240   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2241   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
2242   MIB.addImm(0);
2243   MIB.addImm(8);
2244   MIB.addReg(LabelReg, getKillRegState(true));
2245   MIB.setMemRefs(MMOs);
2246 
2247   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
2248 
2249   // Insert setup.
2250   MIB =
2251       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
2252 
2253   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2254   MIB.addRegMask(RegInfo->getNoPreservedMask());
2255   ThisMBB->addSuccessor(MainMBB);
2256   ThisMBB->addSuccessor(RestoreMBB);
2257 
2258   // MainMBB:
2259   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
2260       .addImm(0)
2261       .addImm(0)
2262       .addImm(0);
2263   MainMBB->addSuccessor(SinkMBB);
2264 
2265   // SinkMBB:
2266   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
2267       .addReg(MainDestReg)
2268       .addMBB(MainMBB)
2269       .addReg(RestoreDestReg)
2270       .addMBB(RestoreMBB);
2271 
2272   // RestoreMBB:
2273   // Restore BP from buf[3] iff this function is using BP.  The address of
2274   // buf is in SX10.
2275   // FIXME: Better to not use SX10 here
2276   if (TFI->hasBP(*MF)) {
2277     MachineInstrBuilder MIB =
2278         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
2279     MIB.addReg(VE::SX10);
2280     MIB.addImm(0);
2281     MIB.addImm(24);
2282     MIB.setMemRefs(MMOs);
2283   }
2284   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
2285       .addImm(0)
2286       .addImm(0)
2287       .addImm(1);
2288   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
2289   RestoreMBB->addSuccessor(SinkMBB);
2290 
2291   MI.eraseFromParent();
2292   return SinkMBB;
2293 }
2294 
2295 MachineBasicBlock *
2296 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
2297                                     MachineBasicBlock *MBB) const {
2298   DebugLoc DL = MI.getDebugLoc();
2299   MachineFunction *MF = MBB->getParent();
2300   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2301   MachineRegisterInfo &MRI = MF->getRegInfo();
2302 
2303   // Memory Reference.
2304   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2305                                            MI.memoperands_end());
2306   Register BufReg = MI.getOperand(0).getReg();
2307 
2308   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
2309   // Since FP is only updated here but NOT referenced, it's treated as GPR.
2310   Register FP = VE::SX9;
2311   Register SP = VE::SX11;
2312 
2313   MachineInstrBuilder MIB;
2314 
2315   MachineBasicBlock *ThisMBB = MBB;
2316 
2317   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
2318   //
2319   // ThisMBB:
2320   //   %fp = load buf[0]
2321   //   %jmp = load buf[1]
2322   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
2323   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
2324   //   jmp %jmp
2325 
2326   // Reload FP.
2327   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
2328   MIB.addReg(BufReg);
2329   MIB.addImm(0);
2330   MIB.addImm(0);
2331   MIB.setMemRefs(MMOs);
2332 
2333   // Reload IP.
2334   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
2335   MIB.addReg(BufReg);
2336   MIB.addImm(0);
2337   MIB.addImm(8);
2338   MIB.setMemRefs(MMOs);
2339 
2340   // Copy BufReg to SX10 for later use in setjmp.
2341   // FIXME: Better to not use SX10 here
2342   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
2343       .addReg(BufReg)
2344       .addImm(0);
2345 
2346   // Reload SP.
2347   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
2348   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
2349   MIB.addImm(0);
2350   MIB.addImm(16);
2351   MIB.setMemRefs(MMOs);
2352 
2353   // Jump.
2354   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
2355       .addReg(Tmp, getKillRegState(true))
2356       .addImm(0);
2357 
2358   MI.eraseFromParent();
2359   return ThisMBB;
2360 }
2361 
2362 MachineBasicBlock *
2363 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
2364                                         MachineBasicBlock *BB) const {
2365   DebugLoc DL = MI.getDebugLoc();
2366   MachineFunction *MF = BB->getParent();
2367   MachineFrameInfo &MFI = MF->getFrameInfo();
2368   MachineRegisterInfo &MRI = MF->getRegInfo();
2369   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2370   int FI = MFI.getFunctionContextIndex();
2371 
2372   // Get a mapping of the call site numbers to all of the landing pads they're
2373   // associated with.
2374   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
2375   unsigned MaxCSNum = 0;
2376   for (auto &MBB : *MF) {
2377     if (!MBB.isEHPad())
2378       continue;
2379 
2380     MCSymbol *Sym = nullptr;
2381     for (const auto &MI : MBB) {
2382       if (MI.isDebugInstr())
2383         continue;
2384 
2385       assert(MI.isEHLabel() && "expected EH_LABEL");
2386       Sym = MI.getOperand(0).getMCSymbol();
2387       break;
2388     }
2389 
2390     if (!MF->hasCallSiteLandingPad(Sym))
2391       continue;
2392 
2393     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
2394       CallSiteNumToLPad[CSI].push_back(&MBB);
2395       MaxCSNum = std::max(MaxCSNum, CSI);
2396     }
2397   }
2398 
2399   // Get an ordered list of the machine basic blocks for the jump table.
2400   std::vector<MachineBasicBlock *> LPadList;
2401   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
2402   LPadList.reserve(CallSiteNumToLPad.size());
2403 
2404   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
2405     for (auto &LP : CallSiteNumToLPad[CSI]) {
2406       LPadList.push_back(LP);
2407       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
2408     }
2409   }
2410 
2411   assert(!LPadList.empty() &&
2412          "No landing pad destinations for the dispatch jump table!");
2413 
2414   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
2415   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
2416   //
2417   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
2418   // First `i64` is callsite, so callsite is FI+8.
2419   static const int OffsetIC = 72;
2420   static const int OffsetCS = 8;
2421 
2422   // Create the MBBs for the dispatch code like following:
2423   //
2424   // ThisMBB:
2425   //   Prepare DispatchBB address and store it to buf[1].
2426   //   ...
2427   //
2428   // DispatchBB:
2429   //   %s15 = GETGOT iff isPositionIndependent
2430   //   %callsite = load callsite
2431   //   brgt.l.t #size of callsites, %callsite, DispContBB
2432   //
2433   // TrapBB:
2434   //   Call abort.
2435   //
2436   // DispContBB:
2437   //   %breg = address of jump table
2438   //   %pc = load and calculate next pc from %breg and %callsite
2439   //   jmp %pc
2440 
2441   // Shove the dispatch's address into the return slot in the function context.
2442   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
2443   DispatchBB->setIsEHPad(true);
2444 
2445   // Trap BB will causes trap like `assert(0)`.
2446   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
2447   DispatchBB->addSuccessor(TrapBB);
2448 
2449   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
2450   DispatchBB->addSuccessor(DispContBB);
2451 
2452   // Insert MBBs.
2453   MF->push_back(DispatchBB);
2454   MF->push_back(DispContBB);
2455   MF->push_back(TrapBB);
2456 
2457   // Insert code to call abort in the TrapBB.
2458   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
2459                                  /* Local */ false, /* Call */ true);
2460   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
2461       .addReg(Abort, getKillRegState(true))
2462       .addImm(0)
2463       .addImm(0);
2464 
2465   // Insert code into the entry block that creates and registers the function
2466   // context.
2467   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
2468 
2469   // Create the jump table and associated information
2470   unsigned JTE = getJumpTableEncoding();
2471   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
2472   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
2473 
2474   const VERegisterInfo &RI = TII->getRegisterInfo();
2475   // Add a register mask with no preserved registers.  This results in all
2476   // registers being marked as clobbered.
2477   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
2478       .addRegMask(RI.getNoPreservedMask());
2479 
2480   if (isPositionIndependent()) {
2481     // Force to generate GETGOT, since current implementation doesn't store GOT
2482     // register.
2483     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
2484   }
2485 
2486   // IReg is used as an index in a memory operand and therefore can't be SP
2487   const TargetRegisterClass *RC = &VE::I64RegClass;
2488   Register IReg = MRI.createVirtualRegister(RC);
2489   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
2490                     OffsetCS);
2491   if (LPadList.size() < 64) {
2492     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
2493         .addImm(VECC::CC_ILE)
2494         .addImm(LPadList.size())
2495         .addReg(IReg)
2496         .addMBB(TrapBB);
2497   } else {
2498     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
2499     Register TmpReg = MRI.createVirtualRegister(RC);
2500     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
2501         .addImm(0)
2502         .addImm(0)
2503         .addImm(LPadList.size());
2504     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
2505         .addImm(VECC::CC_ILE)
2506         .addReg(TmpReg, getKillRegState(true))
2507         .addReg(IReg)
2508         .addMBB(TrapBB);
2509   }
2510 
2511   Register BReg = MRI.createVirtualRegister(RC);
2512   Register Tmp1 = MRI.createVirtualRegister(RC);
2513   Register Tmp2 = MRI.createVirtualRegister(RC);
2514 
2515   if (isPositionIndependent()) {
2516     // Create following instructions for local linkage PIC code.
2517     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
2518     //     and    %Tmp2, %Tmp1, (32)0
2519     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2520     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2521         .addImm(0)
2522         .addImm(0)
2523         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
2524     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2525         .addReg(Tmp1, getKillRegState(true))
2526         .addImm(M0(32));
2527     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
2528         .addReg(VE::SX15)
2529         .addReg(Tmp2, getKillRegState(true))
2530         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
2531   } else {
2532     // Create following instructions for non-PIC code.
2533     //     lea     %Tmp1, .LJTI0_0@lo
2534     //     and     %Tmp2, %Tmp1, (32)0
2535     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
2536     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2537         .addImm(0)
2538         .addImm(0)
2539         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
2540     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2541         .addReg(Tmp1, getKillRegState(true))
2542         .addImm(M0(32));
2543     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
2544         .addReg(Tmp2, getKillRegState(true))
2545         .addImm(0)
2546         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
2547   }
2548 
2549   switch (JTE) {
2550   case MachineJumpTableInfo::EK_BlockAddress: {
2551     // Generate simple block address code for no-PIC model.
2552     //     sll %Tmp1, %IReg, 3
2553     //     lds %TReg, 0(%Tmp1, %BReg)
2554     //     bcfla %TReg
2555 
2556     Register TReg = MRI.createVirtualRegister(RC);
2557     Register Tmp1 = MRI.createVirtualRegister(RC);
2558 
2559     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2560         .addReg(IReg, getKillRegState(true))
2561         .addImm(3);
2562     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
2563         .addReg(BReg, getKillRegState(true))
2564         .addReg(Tmp1, getKillRegState(true))
2565         .addImm(0);
2566     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2567         .addReg(TReg, getKillRegState(true))
2568         .addImm(0);
2569     break;
2570   }
2571   case MachineJumpTableInfo::EK_Custom32: {
2572     // Generate block address code using differences from the function pointer
2573     // for PIC model.
2574     //     sll %Tmp1, %IReg, 2
2575     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
2576     //     Prepare function address in BReg2.
2577     //     adds.l %TReg, %BReg2, %OReg
2578     //     bcfla %TReg
2579 
2580     assert(isPositionIndependent());
2581     Register OReg = MRI.createVirtualRegister(RC);
2582     Register TReg = MRI.createVirtualRegister(RC);
2583     Register Tmp1 = MRI.createVirtualRegister(RC);
2584 
2585     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2586         .addReg(IReg, getKillRegState(true))
2587         .addImm(2);
2588     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
2589         .addReg(BReg, getKillRegState(true))
2590         .addReg(Tmp1, getKillRegState(true))
2591         .addImm(0);
2592     Register BReg2 =
2593         prepareSymbol(*DispContBB, DispContBB->end(),
2594                       DispContBB->getParent()->getName(), DL, /* Local */ true);
2595     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
2596         .addReg(OReg, getKillRegState(true))
2597         .addReg(BReg2, getKillRegState(true));
2598     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2599         .addReg(TReg, getKillRegState(true))
2600         .addImm(0);
2601     break;
2602   }
2603   default:
2604     llvm_unreachable("Unexpected jump table encoding");
2605   }
2606 
2607   // Add the jump table entries as successors to the MBB.
2608   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
2609   for (auto &LP : LPadList)
2610     if (SeenMBBs.insert(LP).second)
2611       DispContBB->addSuccessor(LP);
2612 
2613   // N.B. the order the invoke BBs are processed in doesn't matter here.
2614   SmallVector<MachineBasicBlock *, 64> MBBLPads;
2615   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
2616   for (MachineBasicBlock *MBB : InvokeBBs) {
2617     // Remove the landing pad successor from the invoke block and replace it
2618     // with the new dispatch block.
2619     // Keep a copy of Successors since it's modified inside the loop.
2620     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
2621                                                    MBB->succ_rend());
2622     // FIXME: Avoid quadratic complexity.
2623     for (auto MBBS : Successors) {
2624       if (MBBS->isEHPad()) {
2625         MBB->removeSuccessor(MBBS);
2626         MBBLPads.push_back(MBBS);
2627       }
2628     }
2629 
2630     MBB->addSuccessor(DispatchBB);
2631 
2632     // Find the invoke call and mark all of the callee-saved registers as
2633     // 'implicit defined' so that they're spilled.  This prevents code from
2634     // moving instructions to before the EH block, where they will never be
2635     // executed.
2636     for (auto &II : reverse(*MBB)) {
2637       if (!II.isCall())
2638         continue;
2639 
2640       DenseMap<Register, bool> DefRegs;
2641       for (auto &MOp : II.operands())
2642         if (MOp.isReg())
2643           DefRegs[MOp.getReg()] = true;
2644 
2645       MachineInstrBuilder MIB(*MF, &II);
2646       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
2647         Register Reg = SavedRegs[RI];
2648         if (!DefRegs[Reg])
2649           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
2650       }
2651 
2652       break;
2653     }
2654   }
2655 
2656   // Mark all former landing pads as non-landing pads.  The dispatch is the only
2657   // landing pad now.
2658   for (auto &LP : MBBLPads)
2659     LP->setIsEHPad(false);
2660 
2661   // The instruction is gone now.
2662   MI.eraseFromParent();
2663   return BB;
2664 }
2665 
2666 MachineBasicBlock *
2667 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
2668                                               MachineBasicBlock *BB) const {
2669   switch (MI.getOpcode()) {
2670   default:
2671     llvm_unreachable("Unknown Custom Instruction!");
2672   case VE::EH_SjLj_LongJmp:
2673     return emitEHSjLjLongJmp(MI, BB);
2674   case VE::EH_SjLj_SetJmp:
2675     return emitEHSjLjSetJmp(MI, BB);
2676   case VE::EH_SjLj_Setup_Dispatch:
2677     return emitSjLjDispatchBlock(MI, BB);
2678   }
2679 }
2680 
2681 static bool isI32Insn(const SDNode *User, const SDNode *N) {
2682   switch (User->getOpcode()) {
2683   default:
2684     return false;
2685   case ISD::ADD:
2686   case ISD::SUB:
2687   case ISD::MUL:
2688   case ISD::SDIV:
2689   case ISD::UDIV:
2690   case ISD::SETCC:
2691   case ISD::SMIN:
2692   case ISD::SMAX:
2693   case ISD::SHL:
2694   case ISD::SRA:
2695   case ISD::BSWAP:
2696   case ISD::SINT_TO_FP:
2697   case ISD::UINT_TO_FP:
2698   case ISD::BR_CC:
2699   case ISD::BITCAST:
2700   case ISD::ATOMIC_CMP_SWAP:
2701   case ISD::ATOMIC_SWAP:
2702     return true;
2703   case ISD::SRL:
2704     if (N->getOperand(0).getOpcode() != ISD::SRL)
2705       return true;
2706     // (srl (trunc (srl ...))) may be optimized by combining srl, so
2707     // doesn't optimize trunc now.
2708     return false;
2709   case ISD::SELECT_CC:
2710     if (User->getOperand(2).getNode() != N &&
2711         User->getOperand(3).getNode() != N)
2712       return true;
2713     LLVM_FALLTHROUGH;
2714   case ISD::AND:
2715   case ISD::OR:
2716   case ISD::XOR:
2717   case ISD::SELECT:
2718   case ISD::CopyToReg:
2719     // Check all use of selections, bit operations, and copies.  If all of them
2720     // are safe, optimize truncate to extract_subreg.
2721     for (const SDNode *U : User->uses()) {
2722       switch (U->getOpcode()) {
2723       default:
2724         // If the use is an instruction which treats the source operand as i32,
2725         // it is safe to avoid truncate here.
2726         if (isI32Insn(U, N))
2727           continue;
2728         break;
2729       case ISD::ANY_EXTEND:
2730       case ISD::SIGN_EXTEND:
2731       case ISD::ZERO_EXTEND: {
2732         // Special optimizations to the combination of ext and trunc.
2733         // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
2734         // since this truncate instruction clears higher 32 bits which is filled
2735         // by one of ext instructions later.
2736         assert(N->getValueType(0) == MVT::i32 &&
2737                "find truncate to not i32 integer");
2738         if (User->getOpcode() == ISD::SELECT_CC ||
2739             User->getOpcode() == ISD::SELECT)
2740           continue;
2741         break;
2742       }
2743       }
2744       return false;
2745     }
2746     return true;
2747   }
2748 }
2749 
2750 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
2751 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
2752 // is sometime too late.  So, doing it at here.
2753 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
2754                                           DAGCombinerInfo &DCI) const {
2755   assert(N->getOpcode() == ISD::TRUNCATE &&
2756          "Should be called with a TRUNCATE node");
2757 
2758   SelectionDAG &DAG = DCI.DAG;
2759   SDLoc DL(N);
2760   EVT VT = N->getValueType(0);
2761 
2762   // We prefer to do this when all types are legal.
2763   if (!DCI.isAfterLegalizeDAG())
2764     return SDValue();
2765 
2766   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
2767   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
2768       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
2769       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
2770     return SDValue();
2771 
2772   // Check all use of this TRUNCATE.
2773   for (const SDNode *User : N->uses()) {
2774     // Make sure that we're not going to replace TRUNCATE for non i32
2775     // instructions.
2776     //
2777     // FIXME: Although we could sometimes handle this, and it does occur in
2778     // practice that one of the condition inputs to the select is also one of
2779     // the outputs, we currently can't deal with this.
2780     if (isI32Insn(User, N))
2781       continue;
2782 
2783     return SDValue();
2784   }
2785 
2786   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2787   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
2788                                     N->getOperand(0), SubI32),
2789                  0);
2790 }
2791 
2792 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
2793                                             DAGCombinerInfo &DCI) const {
2794   switch (N->getOpcode()) {
2795   default:
2796     break;
2797   case ISD::TRUNCATE:
2798     return combineTRUNCATE(N, DCI);
2799   }
2800 
2801   return SDValue();
2802 }
2803 
2804 //===----------------------------------------------------------------------===//
2805 // VE Inline Assembly Support
2806 //===----------------------------------------------------------------------===//
2807 
2808 VETargetLowering::ConstraintType
2809 VETargetLowering::getConstraintType(StringRef Constraint) const {
2810   if (Constraint.size() == 1) {
2811     switch (Constraint[0]) {
2812     default:
2813       break;
2814     case 'v': // vector registers
2815       return C_RegisterClass;
2816     }
2817   }
2818   return TargetLowering::getConstraintType(Constraint);
2819 }
2820 
2821 std::pair<unsigned, const TargetRegisterClass *>
2822 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
2823                                                StringRef Constraint,
2824                                                MVT VT) const {
2825   const TargetRegisterClass *RC = nullptr;
2826   if (Constraint.size() == 1) {
2827     switch (Constraint[0]) {
2828     default:
2829       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2830     case 'r':
2831       RC = &VE::I64RegClass;
2832       break;
2833     case 'v':
2834       RC = &VE::V64RegClass;
2835       break;
2836     }
2837     return std::make_pair(0U, RC);
2838   }
2839 
2840   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2841 }
2842 
2843 //===----------------------------------------------------------------------===//
2844 // VE Target Optimization Support
2845 //===----------------------------------------------------------------------===//
2846 
2847 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
2848   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
2849   if (isJumpTableRelative())
2850     return 8;
2851 
2852   return TargetLowering::getMinimumJumpTableEntries();
2853 }
2854 
2855 bool VETargetLowering::hasAndNot(SDValue Y) const {
2856   EVT VT = Y.getValueType();
2857 
2858   // VE doesn't have vector and not instruction.
2859   if (VT.isVector())
2860     return false;
2861 
2862   // VE allows different immediate values for X and Y where ~X & Y.
2863   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
2864   // function is used to check whether an immediate value is OK for and-not
2865   // instruction as both X and Y.  Generating additional instruction to
2866   // retrieve an immediate value is no good since the purpose of this
2867   // function is to convert a series of 3 instructions to another series of
2868   // 3 instructions with better parallelism.  Therefore, we return false
2869   // for all immediate values now.
2870   // FIXME: Change hasAndNot function to have two operands to make it work
2871   //        correctly with Aurora VE.
2872   if (isa<ConstantSDNode>(Y))
2873     return false;
2874 
2875   // It's ok for generic registers.
2876   return true;
2877 }
2878 
2879 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
2880                                                   SelectionDAG &DAG) const {
2881   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
2882   MVT VT = Op.getOperand(0).getSimpleValueType();
2883 
2884   // Special treatment for packed V64 types.
2885   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
2886   (void)VT;
2887   // Example of codes:
2888   //   %packed_v = extractelt %vr, %idx / 2
2889   //   %v = %packed_v >> (%idx % 2 * 32)
2890   //   %res = %v & 0xffffffff
2891 
2892   SDValue Vec = Op.getOperand(0);
2893   SDValue Idx = Op.getOperand(1);
2894   SDLoc DL(Op);
2895   SDValue Result = Op;
2896   if (false /* Idx->isConstant() */) {
2897     // TODO: optimized implementation using constant values
2898   } else {
2899     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
2900     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
2901     SDValue PackedElt =
2902         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
2903     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
2904     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
2905     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
2906     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
2907     PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
2908     SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
2909     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
2910     SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2911     Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
2912                                         MVT::i32, PackedElt, SubI32),
2913                      0);
2914 
2915     if (Op.getSimpleValueType() == MVT::f32) {
2916       Result = DAG.getBitcast(MVT::f32, Result);
2917     } else {
2918       assert(Op.getSimpleValueType() == MVT::i32);
2919     }
2920   }
2921   return Result;
2922 }
2923 
2924 SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
2925                                                  SelectionDAG &DAG) const {
2926   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
2927   MVT VT = Op.getOperand(0).getSimpleValueType();
2928 
2929   // Special treatment for packed V64 types.
2930   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
2931   (void)VT;
2932   // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
2933   // bits" required `val << 32` from C implementation's point of view.
2934   //
2935   // Example of codes:
2936   //   %packed_elt = extractelt %vr, (%idx >> 1)
2937   //   %shift = ((%idx & 1) ^ 1) << 5
2938   //   %packed_elt &= 0xffffffff00000000 >> shift
2939   //   %packed_elt |= (zext %val) << shift
2940   //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)
2941 
2942   SDLoc DL(Op);
2943   SDValue Vec = Op.getOperand(0);
2944   SDValue Val = Op.getOperand(1);
2945   SDValue Idx = Op.getOperand(2);
2946   if (Idx.getSimpleValueType() == MVT::i32)
2947     Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
2948   if (Val.getSimpleValueType() == MVT::f32)
2949     Val = DAG.getBitcast(MVT::i32, Val);
2950   assert(Val.getSimpleValueType() == MVT::i32);
2951   Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
2952 
2953   SDValue Result = Op;
2954   if (false /* Idx->isConstant()*/) {
2955     // TODO: optimized implementation using constant values
2956   } else {
2957     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
2958     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
2959     SDValue PackedElt =
2960         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
2961     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
2962     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
2963     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
2964     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
2965     SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
2966     Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
2967     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
2968     Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
2969     PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
2970     Result =
2971         SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
2972                                    {HalfIdx, PackedElt, Vec}),
2973                 0);
2974   }
2975   return Result;
2976 }
2977