1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "nvptx-isel"
30 
31 /// createNVPTXISelDag - This pass converts a legalized DAG into a
32 /// NVPTX-specific DAG, ready for instruction scheduling.
33 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
34                                        llvm::CodeGenOpt::Level OptLevel) {
35   return new NVPTXDAGToDAGISel(TM, OptLevel);
36 }
37 
38 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
39                                      CodeGenOpt::Level OptLevel)
40     : SelectionDAGISel(tm, OptLevel), TM(tm) {
41   doMulWide = (OptLevel > 0);
42 }
43 
44 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
45   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
46   return SelectionDAGISel::runOnMachineFunction(MF);
47 }
48 
49 int NVPTXDAGToDAGISel::getDivF32Level() const {
50   return Subtarget->getTargetLowering()->getDivF32Level();
51 }
52 
53 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
54   return Subtarget->getTargetLowering()->usePrecSqrtF32();
55 }
56 
57 bool NVPTXDAGToDAGISel::useF32FTZ() const {
58   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
59 }
60 
61 bool NVPTXDAGToDAGISel::allowFMA() const {
62   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
63   return TL->allowFMA(*MF, OptLevel);
64 }
65 
66 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
67   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68   return TL->allowUnsafeFPMath(*MF);
69 }
70 
71 bool NVPTXDAGToDAGISel::useShortPointers() const {
72   return TM.useShortPointers();
73 }
74 
75 /// Select - Select instructions not customized! Used for
76 /// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // Dispatch on the node's opcode.  Each try* helper returns true if it
  // selected the node; on false we fall through to the auto-generated
  // table-driven selector (SelectCode) below.
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    // Always custom-selected; SelectSETP_F16X2 unconditionally replaces N.
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // All texture-fetch (tex.*) and texel-gather (tld4.*) variants, including
  // the "unified" forms, funnel into tryTextureIntrinsic.
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // All surface-load (suld.*) variants — Clamp, Trap, and Zero
  // out-of-bounds modes — funnel into trySurfaceIntrinsic.
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    // Always custom-selected; SelectAddrSpaceCast unconditionally replaces N.
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  // Fall back to the tablegen-generated matcher.
  SelectCode(N);
}
506 
507 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
508   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
509   switch (IID) {
510   default:
511     return false;
512   case Intrinsic::nvvm_ldg_global_f:
513   case Intrinsic::nvvm_ldg_global_i:
514   case Intrinsic::nvvm_ldg_global_p:
515   case Intrinsic::nvvm_ldu_global_f:
516   case Intrinsic::nvvm_ldu_global_i:
517   case Intrinsic::nvvm_ldu_global_p:
518     return tryLDGLDU(N);
519   }
520 }
521 
522 // There's no way to specify FP16 immediates in .f16 ops, so we have to
523 // load them into an .f16 register first.
524 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
525   if (N->getValueType(0) != MVT::f16)
526     return false;
527   SDValue Val = CurDAG->getTargetConstantFP(
528       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
529   SDNode *LoadConstF16 =
530       CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
531   ReplaceNode(N, LoadConstF16);
532   return true;
533 }
534 
535 // Map ISD:CONDCODE value to appropriate CmpMode expected by
536 // NVPTXInstPrinter::printCmpMode()
537 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
538   using NVPTX::PTXCmpMode::CmpMode;
539   unsigned PTXCmpMode = [](ISD::CondCode CC) {
540     switch (CC) {
541     default:
542       llvm_unreachable("Unexpected condition code.");
543     case ISD::SETOEQ:
544       return CmpMode::EQ;
545     case ISD::SETOGT:
546       return CmpMode::GT;
547     case ISD::SETOGE:
548       return CmpMode::GE;
549     case ISD::SETOLT:
550       return CmpMode::LT;
551     case ISD::SETOLE:
552       return CmpMode::LE;
553     case ISD::SETONE:
554       return CmpMode::NE;
555     case ISD::SETO:
556       return CmpMode::NUM;
557     case ISD::SETUO:
558       return CmpMode::NotANumber;
559     case ISD::SETUEQ:
560       return CmpMode::EQU;
561     case ISD::SETUGT:
562       return CmpMode::GTU;
563     case ISD::SETUGE:
564       return CmpMode::GEU;
565     case ISD::SETULT:
566       return CmpMode::LTU;
567     case ISD::SETULE:
568       return CmpMode::LEU;
569     case ISD::SETUNE:
570       return CmpMode::NEU;
571     case ISD::SETEQ:
572       return CmpMode::EQ;
573     case ISD::SETGT:
574       return CmpMode::GT;
575     case ISD::SETGE:
576       return CmpMode::GE;
577     case ISD::SETLT:
578       return CmpMode::LT;
579     case ISD::SETLE:
580       return CmpMode::LE;
581     case ISD::SETNE:
582       return CmpMode::NE;
583     }
584   }(CondCode.get());
585 
586   if (FTZ)
587     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
588 
589   return PTXCmpMode;
590 }
591 
592 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
593   unsigned PTXCmpMode =
594       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
595   SDLoc DL(N);
596   SDNode *SetP = CurDAG->getMachineNode(
597       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
598       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
599   ReplaceNode(N, SetP);
600   return true;
601 }
602 
603 // Find all instances of extract_vector_elt that use this v2f16 vector
604 // and coalesce them into a scattering move instruction.
605 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
606   SDValue Vector = N->getOperand(0);
607 
608   // We only care about f16x2 as it's the only real vector type we
609   // need to deal with.
610   if (Vector.getSimpleValueType() != MVT::v2f16)
611     return false;
612 
613   // Find and record all uses of this vector that extract element 0 or 1.
614   SmallVector<SDNode *, 4> E0, E1;
615   for (auto U : Vector.getNode()->uses()) {
616     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
617       continue;
618     if (U->getOperand(0) != Vector)
619       continue;
620     if (const ConstantSDNode *IdxConst =
621             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
622       if (IdxConst->getZExtValue() == 0)
623         E0.push_back(U);
624       else if (IdxConst->getZExtValue() == 1)
625         E1.push_back(U);
626       else
627         llvm_unreachable("Invalid vector index.");
628     }
629   }
630 
631   // There's no point scattering f16x2 if we only ever access one
632   // element of it.
633   if (E0.empty() || E1.empty())
634     return false;
635 
636   unsigned Op = NVPTX::SplitF16x2;
637   // If the vector has been BITCAST'ed from i32, we can use original
638   // value directly and avoid register-to-register move.
639   SDValue Source = Vector;
640   if (Vector->getOpcode() == ISD::BITCAST) {
641     Op = NVPTX::SplitI32toF16x2;
642     Source = Vector->getOperand(0);
643   }
644   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
645   // into f16,f16 SplitF16x2(V)
646   SDNode *ScatterOp =
647       CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
648   for (auto *Node : E0)
649     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
650   for (auto *Node : E1)
651     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
652 
653   return true;
654 }
655 
656 static unsigned int getCodeAddrSpace(MemSDNode *N) {
657   const Value *Src = N->getMemOperand()->getValue();
658 
659   if (!Src)
660     return NVPTX::PTXLdStInstCode::GENERIC;
661 
662   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
663     switch (PT->getAddressSpace()) {
664     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
665     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
666     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
667     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
668     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
669     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
670     default: break;
671     }
672   }
673   return NVPTX::PTXLdStInstCode::GENERIC;
674 }
675 
676 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
677                           unsigned CodeAddrSpace, MachineFunction *F) {
678   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
679   // space.
680   //
681   // We have two ways of identifying invariant loads: Loads may be explicitly
682   // marked as invariant, or we may infer them to be invariant.
683   //
684   // We currently infer invariance for loads from
685   //  - constant global variables, and
686   //  - kernel function pointer params that are noalias (i.e. __restrict) and
687   //    never written to.
688   //
689   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
690   // not during the SelectionDAG phase).
691   //
692   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
693   // explicitly invariant loads because these are how clang tells us to use ldg
694   // when the user uses a builtin.
695   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
696     return false;
697 
698   if (N->isInvariant())
699     return true;
700 
701   bool IsKernelFn = isKernelFunction(F->getFunction());
702 
703   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
704   // because the former looks through phi nodes while the latter does not. We
705   // need to look through phi nodes to handle pointer induction variables.
706   SmallVector<const Value *, 8> Objs;
707   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
708 
709   return all_of(Objs, [&](const Value *V) {
710     if (auto *A = dyn_cast<const Argument>(V))
711       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
712     if (auto *GV = dyn_cast<const GlobalVariable>(V))
713       return GV->isConstant();
714     return false;
715   });
716 }
717 
718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
719   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
720   switch (IID) {
721   default:
722     return false;
723   case Intrinsic::nvvm_texsurf_handle_internal:
724     SelectTexSurfHandle(N);
725     return true;
726   }
727 }
728 
729 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
730   // Op 0 is the intrinsic ID
731   SDValue Wrapper = N->getOperand(1);
732   SDValue GlobalVal = Wrapper.getOperand(0);
733   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
734                                         MVT::i64, GlobalVal));
735 }
736 
// Select a cvta/cvta.to instruction for an addrspacecast.  One side of the
// cast must be the generic address space; a specific-to-specific cast is a
// fatal error.  The *_6432/_3264 opcode variants handle short (32-bit)
// specific-space pointers on a 64-bit target.
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // _6432: widen a 32-bit shared pointer to a 64-bit generic pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // _3264: narrow a 64-bit generic pointer to a 32-bit shared pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                                : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                             : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
808 
809 // Helper function template to reduce amount of boilerplate code for
810 // opcode selection.
// Helper function template to reduce amount of boilerplate code for
// opcode selection: maps a simple value type to the corresponding opcode
// from the per-type candidates.  Returns None for unsupported types so
// callers can bail out of custom selection.
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    // i1 is widened to i8 for memory operations.
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}
837 
// Select a PTX "ld" machine instruction for a scalar (or v2f16) load node.
// Returns true and replaces N on success; returns false to let the default
// tablegen-based selection handle nodes this routine does not cover
// (indexed loads, non-simple memory types, loads with ordering stronger
// than monotonic).
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null when N is not an ordinary LoadSDNode (e.g. an atomic
  // load reaching here as a plain MemSDNode).
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // Global loads whose result provably cannot be modified may instead be
  // selected as ld.global.nc / ldu via tryLDGLDU.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // Try addressing modes from most to least specific:
  // avar (direct symbol), asi (symbol+imm), ari (reg+imm), areg (plain reg).
  // Each arm picks the per-type opcode; a None opcode means no instruction
  // variant exists for TargetVT and we fall back to default selection.
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    // Operand order is fixed by the LD instruction definitions:
    // volatile, addrspace, vector arity, from-type, from-width, address, chain.
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    // reg+imm addressing uses distinct opcodes for 64-bit pointers.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    // Fallback: plain register addressing (areg), using the unmatched
    // address operand N1 directly.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Carry the original memory operand over so later passes keep the
  // correct aliasing/volatility information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
986 
// Select a vector PTX "ld.v2"/"ld.v4" machine instruction for an
// NVPTXISD::LoadV2/LoadV4 node. Returns true and replaces N on success;
// returns false to fall back to default selection.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Eligible global loads are selected as ld.global.nc / ldu instead.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Try addressing modes from most to least specific:
  // avar (direct symbol), asi (symbol+imm), ari (reg+imm), areg (plain reg).
  // A None opcode from pickOpcodeForVT means no variant exists for EltVT
  // (e.g. no 64-bit-element v4 loads), so we fall back to default selection.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    // Operand order is fixed by the LDV instruction definitions:
    // volatile, addrspace, vector arity, from-type, from-width, address, chain.
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // reg+imm addressing uses distinct opcodes for 64-bit pointers.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    // Fallback: plain register addressing (areg), using the unmatched
    // address operand Op1 directly.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Carry the original memory operand over so later passes keep the
  // correct aliasing/volatility information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1225 
1226 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1227 
1228   SDValue Chain = N->getOperand(0);
1229   SDValue Op1;
1230   MemSDNode *Mem;
1231   bool IsLDG = true;
1232 
1233   // If this is an LDG intrinsic, the address is the third operand. If its an
1234   // LDG/LDU SD node (from custom vector handling), then its the second operand
1235   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1236     Op1 = N->getOperand(2);
1237     Mem = cast<MemIntrinsicSDNode>(N);
1238     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1239     switch (IID) {
1240     default:
1241       return false;
1242     case Intrinsic::nvvm_ldg_global_f:
1243     case Intrinsic::nvvm_ldg_global_i:
1244     case Intrinsic::nvvm_ldg_global_p:
1245       IsLDG = true;
1246       break;
1247     case Intrinsic::nvvm_ldu_global_f:
1248     case Intrinsic::nvvm_ldu_global_i:
1249     case Intrinsic::nvvm_ldu_global_p:
1250       IsLDG = false;
1251       break;
1252     }
1253   } else {
1254     Op1 = N->getOperand(1);
1255     Mem = cast<MemSDNode>(N);
1256   }
1257 
1258   Optional<unsigned> Opcode;
1259   SDLoc DL(N);
1260   SDNode *LD;
1261   SDValue Base, Offset, Addr;
1262 
1263   EVT EltVT = Mem->getMemoryVT();
1264   unsigned NumElts = 1;
1265   if (EltVT.isVector()) {
1266     NumElts = EltVT.getVectorNumElements();
1267     EltVT = EltVT.getVectorElementType();
1268     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1269     if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1270       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1271       EltVT = MVT::v2f16;
1272       NumElts /= 2;
1273     }
1274   }
1275 
1276   // Build the "promoted" result VTList for the load. If we are really loading
1277   // i8s, then the return type will be promoted to i16 since we do not expose
1278   // 8-bit registers in NVPTX.
1279   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1280   SmallVector<EVT, 5> InstVTs;
1281   for (unsigned i = 0; i != NumElts; ++i) {
1282     InstVTs.push_back(NodeVT);
1283   }
1284   InstVTs.push_back(MVT::Other);
1285   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1286 
1287   if (SelectDirectAddr(Op1, Addr)) {
1288     switch (N->getOpcode()) {
1289     default:
1290       return false;
1291     case ISD::LOAD:
1292     case ISD::INTRINSIC_W_CHAIN:
1293       if (IsLDG)
1294         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1295                                      NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1296                                      NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1297                                      NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1298                                      NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1299                                      NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1300                                      NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1301                                      NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1302                                      NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1303       else
1304         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1305                                      NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1306                                      NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1307                                      NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1308                                      NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1309                                      NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1310                                      NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1311                                      NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1312                                      NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1313       break;
1314     case NVPTXISD::LoadV2:
1315     case NVPTXISD::LDGV2:
1316       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1317                                    NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1318                                    NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1319                                    NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1320                                    NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1321                                    NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1322                                    NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1323                                    NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1324                                    NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1325       break;
1326     case NVPTXISD::LDUV2:
1327       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1328                                    NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1329                                    NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1330                                    NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1331                                    NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1332                                    NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1333                                    NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1334                                    NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1335                                    NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1336       break;
1337     case NVPTXISD::LoadV4:
1338     case NVPTXISD::LDGV4:
1339       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1340                                NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1341                                NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1342                                NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1343                                NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1344                                NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1345                                NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1346       break;
1347     case NVPTXISD::LDUV4:
1348       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1349                                NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1350                                NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1351                                NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1352                                NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1353                                NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1354                                NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1355       break;
1356     }
1357     if (!Opcode)
1358       return false;
1359     SDValue Ops[] = { Addr, Chain };
1360     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1361   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1362                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1363     if (TM.is64Bit()) {
1364       switch (N->getOpcode()) {
1365       default:
1366         return false;
1367       case ISD::LOAD:
1368       case ISD::INTRINSIC_W_CHAIN:
1369         if (IsLDG)
1370           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1371                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1372                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1373                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1374                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1375                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1376                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1377                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1378                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1379         else
1380           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1381                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1382                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1383                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1384                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1385                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1386                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1387                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1388                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1389         break;
1390       case NVPTXISD::LoadV2:
1391       case NVPTXISD::LDGV2:
1392         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1393                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1394                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1395                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1396                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1397                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1398                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1399                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1400                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1401         break;
1402       case NVPTXISD::LDUV2:
1403         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1404                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1405                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1406                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1407                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1408                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1409                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1410                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1411                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1412         break;
1413       case NVPTXISD::LoadV4:
1414       case NVPTXISD::LDGV4:
1415         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1416                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1417                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1418                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1419                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1420                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1421                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1422         break;
1423       case NVPTXISD::LDUV4:
1424         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1425                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1426                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1427                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1428                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1429                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1430                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1431         break;
1432       }
1433     } else {
1434       switch (N->getOpcode()) {
1435       default:
1436         return false;
1437       case ISD::LOAD:
1438       case ISD::INTRINSIC_W_CHAIN:
1439         if (IsLDG)
1440           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1441                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1442                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1443                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1444                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1445                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1446                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1447                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1448                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1449         else
1450           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1451                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1452                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1453                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1454                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1455                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1456                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1457                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1458                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1459         break;
1460       case NVPTXISD::LoadV2:
1461       case NVPTXISD::LDGV2:
1462         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1463                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1464                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1465                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1466                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1467                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1468                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1469                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1470                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1471         break;
1472       case NVPTXISD::LDUV2:
1473         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1474                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1475                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1476                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1477                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1478                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1479                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1480                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1481                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1482         break;
1483       case NVPTXISD::LoadV4:
1484       case NVPTXISD::LDGV4:
1485         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1486                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1487                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1488                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1489                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1490                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1491                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1492         break;
1493       case NVPTXISD::LDUV4:
1494         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1495                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1496                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1497                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1498                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1499                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1500                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1501         break;
1502       }
1503     }
1504     if (!Opcode)
1505       return false;
1506     SDValue Ops[] = {Base, Offset, Chain};
1507     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1508   } else {
1509     if (TM.is64Bit()) {
1510       switch (N->getOpcode()) {
1511       default:
1512         return false;
1513       case ISD::LOAD:
1514       case ISD::INTRINSIC_W_CHAIN:
1515         if (IsLDG)
1516           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1517                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1518                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1519                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1520                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1521                                        NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1522                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1523                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1524                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1525         else
1526           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1527                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1528                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1529                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1530                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1531                                        NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1532                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1533                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1534                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1535         break;
1536       case NVPTXISD::LoadV2:
1537       case NVPTXISD::LDGV2:
1538         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1539                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1540                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1541                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1542                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1543                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1544                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1545                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1546                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1547         break;
1548       case NVPTXISD::LDUV2:
1549         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1550                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1551                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1552                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1553                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1554                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1555                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1556                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1557                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1558         break;
1559       case NVPTXISD::LoadV4:
1560       case NVPTXISD::LDGV4:
1561         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1562                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1563                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1564                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1565                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1566                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1567                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1568         break;
1569       case NVPTXISD::LDUV4:
1570         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1571                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1572                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1573                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1574                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1575                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1576                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1577         break;
1578       }
1579     } else {
1580       switch (N->getOpcode()) {
1581       default:
1582         return false;
1583       case ISD::LOAD:
1584       case ISD::INTRINSIC_W_CHAIN:
1585         if (IsLDG)
1586           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1587                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1588                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1589                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1590                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1591                                    NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1592                                    NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1593                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1594                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1595         else
1596           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1597                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1598                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1599                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1600                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1601                                    NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1602                                    NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1603                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1604                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1605         break;
1606       case NVPTXISD::LoadV2:
1607       case NVPTXISD::LDGV2:
1608         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1609                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1610                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1611                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1612                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1613                                  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1614                                  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1615                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1616                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1617         break;
1618       case NVPTXISD::LDUV2:
1619         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1620                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1621                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1622                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1623                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1624                                  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1625                                  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1626                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1627                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1628         break;
1629       case NVPTXISD::LoadV4:
1630       case NVPTXISD::LDGV4:
1631         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1632                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1633                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1634                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1635                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1636                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1637                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1638         break;
1639       case NVPTXISD::LDUV4:
1640         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1641                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1642                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1643                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1644                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1645                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1646                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1647         break;
1648       }
1649     }
1650     if (!Opcode)
1651       return false;
1652     SDValue Ops[] = { Op1, Chain };
1653     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1654   }
1655 
1656   MachineMemOperand *MemRef = Mem->getMemOperand();
1657   CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1658 
1659   // For automatic generation of LDG (through SelectLoad[Vector], not the
1660   // intrinsics), we may have an extending load like:
1661   //
1662   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1663   //
1664   // In this case, the matching logic above will select a load for the original
1665   // memory type (in this case, i8) and our types will not match (the node needs
1666   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1667   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1668   // CVT instruction. Ptxas should clean up any redundancies here.
1669 
1670   EVT OrigType = N->getValueType(0);
1671   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1672 
1673   if (OrigType != EltVT && LdNode) {
1674     // We have an extending-load. The instruction we selected operates on the
1675     // smaller type, but the SDNode we are replacing has the larger type. We
1676     // need to emit a CVT to make the types match.
1677     bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1678     unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1679                                        EltVT.getSimpleVT(), IsSigned);
1680 
1681     // For each output value, apply the manual sign/zero-extension and make sure
1682     // all users of the load go through that CVT.
1683     for (unsigned i = 0; i != NumElts; ++i) {
1684       SDValue Res(LD, i);
1685       SDValue OrigVal(N, i);
1686 
1687       SDNode *CvtNode =
1688         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1689                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1690                                                          DL, MVT::i32));
1691       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1692     }
1693   }
1694 
1695   ReplaceNode(N, LD);
1696   return true;
1697 }
1698 
1699 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1700   SDLoc dl(N);
1701   MemSDNode *ST = cast<MemSDNode>(N);
1702   assert(ST->writeMem() && "Expected store");
1703   StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1704   AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1705   assert((PlainStore || AtomicStore) && "Expected store");
1706   EVT StoreVT = ST->getMemoryVT();
1707   SDNode *NVPTXST = nullptr;
1708 
1709   // do not support pre/post inc/dec
1710   if (PlainStore && PlainStore->isIndexed())
1711     return false;
1712 
1713   if (!StoreVT.isSimple())
1714     return false;
1715 
1716   AtomicOrdering Ordering = ST->getSuccessOrdering();
1717   // In order to lower atomic loads with stronger guarantees we would need to
1718   // use store.release or insert fences. However these features were only added
1719   // with PTX ISA 6.0 / sm_70.
1720   // TODO: Check if we can actually use the new instructions and implement them.
1721   if (isStrongerThanMonotonic(Ordering))
1722     return false;
1723 
1724   // Address Space Setting
1725   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1726   unsigned int PointerSize =
1727       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1728 
1729   // Volatile Setting
1730   // - .volatile is only available for .global and .shared
1731   // - .volatile has the same memory synchronization semantics as .relaxed.sys
1732   bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1733   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1734       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1735       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1736     isVolatile = false;
1737 
1738   // Vector Setting
1739   MVT SimpleVT = StoreVT.getSimpleVT();
1740   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1741 
1742   // Type Setting: toType + toTypeWidth
1743   // - for integer type, always use 'u'
1744   //
1745   MVT ScalarVT = SimpleVT.getScalarType();
1746   unsigned toTypeWidth = ScalarVT.getSizeInBits();
1747   if (SimpleVT.isVector()) {
1748     assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
1749     // v2f16 is stored using st.b32
1750     toTypeWidth = 32;
1751   }
1752 
1753   unsigned int toType;
1754   if (ScalarVT.isFloatingPoint())
1755     // f16 uses .b16 as its storage type.
1756     toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1757                                            : NVPTX::PTXLdStInstCode::Float;
1758   else
1759     toType = NVPTX::PTXLdStInstCode::Unsigned;
1760 
1761   // Create the machine instruction DAG
1762   SDValue Chain = ST->getChain();
1763   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1764   SDValue BasePtr = ST->getBasePtr();
1765   SDValue Addr;
1766   SDValue Offset, Base;
1767   Optional<unsigned> Opcode;
1768   MVT::SimpleValueType SourceVT =
1769       Value.getNode()->getSimpleValueType(0).SimpleTy;
1770 
1771   if (SelectDirectAddr(BasePtr, Addr)) {
1772     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1773                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1774                              NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
1775                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1776     if (!Opcode)
1777       return false;
1778     SDValue Ops[] = {Value,
1779                      getI32Imm(isVolatile, dl),
1780                      getI32Imm(CodeAddrSpace, dl),
1781                      getI32Imm(vecType, dl),
1782                      getI32Imm(toType, dl),
1783                      getI32Imm(toTypeWidth, dl),
1784                      Addr,
1785                      Chain};
1786     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1787   } else if (PointerSize == 64
1788                  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1789                  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1790     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1791                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1792                              NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
1793                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1794     if (!Opcode)
1795       return false;
1796     SDValue Ops[] = {Value,
1797                      getI32Imm(isVolatile, dl),
1798                      getI32Imm(CodeAddrSpace, dl),
1799                      getI32Imm(vecType, dl),
1800                      getI32Imm(toType, dl),
1801                      getI32Imm(toTypeWidth, dl),
1802                      Base,
1803                      Offset,
1804                      Chain};
1805     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1806   } else if (PointerSize == 64
1807                  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1808                  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1809     if (PointerSize == 64)
1810       Opcode = pickOpcodeForVT(
1811           SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1812           NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
1813           NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1814     else
1815       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1816                                NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1817                                NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
1818                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1819     if (!Opcode)
1820       return false;
1821 
1822     SDValue Ops[] = {Value,
1823                      getI32Imm(isVolatile, dl),
1824                      getI32Imm(CodeAddrSpace, dl),
1825                      getI32Imm(vecType, dl),
1826                      getI32Imm(toType, dl),
1827                      getI32Imm(toTypeWidth, dl),
1828                      Base,
1829                      Offset,
1830                      Chain};
1831     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1832   } else {
1833     if (PointerSize == 64)
1834       Opcode =
1835           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1836                           NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1837                           NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
1838                           NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1839     else
1840       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1841                                NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1842                                NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
1843                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1844     if (!Opcode)
1845       return false;
1846     SDValue Ops[] = {Value,
1847                      getI32Imm(isVolatile, dl),
1848                      getI32Imm(CodeAddrSpace, dl),
1849                      getI32Imm(vecType, dl),
1850                      getI32Imm(toType, dl),
1851                      getI32Imm(toTypeWidth, dl),
1852                      BasePtr,
1853                      Chain};
1854     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1855   }
1856 
1857   if (!NVPTXST)
1858     return false;
1859 
1860   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1861   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1862   ReplaceNode(N, NVPTXST);
1863   return true;
1864 }
1865 
// Select NVPTXISD::StoreV2/StoreV4 into an STV_* machine instruction,
// picking the opcode variant from the element type, the addressing mode the
// address operand matches (avar/asi/ari/areg) and the pointer width.
// Returns false when the node cannot be handled here.
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1); // First stored value; fixes the element type.
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 (Untyped) as its storage type.
    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    ToType = NVPTX::PTXLdStInstCode::Unsigned;

  SmallVector<SDValue, 12> StOps;
  SDValue N2; // Address operand; follows the stored values in the node.
  unsigned VecType;

  // Gather the values being stored; the operand after the last value is the
  // address.
  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  // NOTE(review): the assert message below says "load"; this is a store path.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  // Flag operands, in the order the STV_* instruction definitions expect,
  // immediately after the stored values.
  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  // Pick the opcode by addressing mode, then append the address operand(s).
  // A None entry in pickOpcodeForVT means there is no v4 instruction for
  // that element type (no 64-bit v4 variants exist).
  if (SelectDirectAddr(N2, Addr)) {
    // Direct (symbolic) address.
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                          NVPTX::STV_f32_v4_avar, None);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    // Symbol + immediate offset.
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
                          NVPTX::STV_f32_v4_asi, None);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    // Register + immediate offset; pointer width picks the _64 variants.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
            NVPTX::STV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                            NVPTX::STV_f32_v4_ari, None);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    // Fallback: plain register address.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
            NVPTX::STV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
                            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
                            NVPTX::STV_f32_v4_areg, None);
        break;
      }
    }
    StOps.push_back(N2);
  }

  // pickOpcodeForVT returns None for unsupported element types.
  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  // Carry the memory operand over so later passes keep the aliasing info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}
2089 
2090 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2091   SDValue Chain = Node->getOperand(0);
2092   SDValue Offset = Node->getOperand(2);
2093   SDValue Flag = Node->getOperand(3);
2094   SDLoc DL(Node);
2095   MemSDNode *Mem = cast<MemSDNode>(Node);
2096 
2097   unsigned VecSize;
2098   switch (Node->getOpcode()) {
2099   default:
2100     return false;
2101   case NVPTXISD::LoadParam:
2102     VecSize = 1;
2103     break;
2104   case NVPTXISD::LoadParamV2:
2105     VecSize = 2;
2106     break;
2107   case NVPTXISD::LoadParamV4:
2108     VecSize = 4;
2109     break;
2110   }
2111 
2112   EVT EltVT = Node->getValueType(0);
2113   EVT MemVT = Mem->getMemoryVT();
2114 
2115   Optional<unsigned> Opcode;
2116 
2117   switch (VecSize) {
2118   default:
2119     return false;
2120   case 1:
2121     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2122                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2123                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2124                              NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2125                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2126     break;
2127   case 2:
2128     Opcode =
2129         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2130                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2131                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2132                         NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2133                         NVPTX::LoadParamMemV2F64);
2134     break;
2135   case 4:
2136     Opcode = pickOpcodeForVT(
2137         MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2138         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2139         NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2140         NVPTX::LoadParamMemV4F32, None);
2141     break;
2142   }
2143   if (!Opcode)
2144     return false;
2145 
2146   SDVTList VTs;
2147   if (VecSize == 1) {
2148     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2149   } else if (VecSize == 2) {
2150     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2151   } else {
2152     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2153     VTs = CurDAG->getVTList(EVTs);
2154   }
2155 
2156   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2157 
2158   SmallVector<SDValue, 2> Ops;
2159   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2160   Ops.push_back(Chain);
2161   Ops.push_back(Flag);
2162 
2163   ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2164   return true;
2165 }
2166 
2167 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2168   SDLoc DL(N);
2169   SDValue Chain = N->getOperand(0);
2170   SDValue Offset = N->getOperand(1);
2171   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2172   MemSDNode *Mem = cast<MemSDNode>(N);
2173 
2174   // How many elements do we have?
2175   unsigned NumElts = 1;
2176   switch (N->getOpcode()) {
2177   default:
2178     return false;
2179   case NVPTXISD::StoreRetval:
2180     NumElts = 1;
2181     break;
2182   case NVPTXISD::StoreRetvalV2:
2183     NumElts = 2;
2184     break;
2185   case NVPTXISD::StoreRetvalV4:
2186     NumElts = 4;
2187     break;
2188   }
2189 
2190   // Build vector of operands
2191   SmallVector<SDValue, 6> Ops;
2192   for (unsigned i = 0; i < NumElts; ++i)
2193     Ops.push_back(N->getOperand(i + 2));
2194   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2195   Ops.push_back(Chain);
2196 
2197   // Determine target opcode
2198   // If we have an i1, use an 8-bit store. The lowering code in
2199   // NVPTXISelLowering will have already emitted an upcast.
2200   Optional<unsigned> Opcode = 0;
2201   switch (NumElts) {
2202   default:
2203     return false;
2204   case 1:
2205     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2206                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2207                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2208                              NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2209                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2210     break;
2211   case 2:
2212     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2213                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2214                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2215                              NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2216                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2217     break;
2218   case 4:
2219     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2220                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2221                              NVPTX::StoreRetvalV4I32, None,
2222                              NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2223                              NVPTX::StoreRetvalV4F32, None);
2224     break;
2225   }
2226   if (!Opcode)
2227     return false;
2228 
2229   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2230   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2231   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2232 
2233   ReplaceNode(N, Ret);
2234   return true;
2235 }
2236 
2237 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2238   SDLoc DL(N);
2239   SDValue Chain = N->getOperand(0);
2240   SDValue Param = N->getOperand(1);
2241   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2242   SDValue Offset = N->getOperand(2);
2243   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2244   MemSDNode *Mem = cast<MemSDNode>(N);
2245   SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2246 
2247   // How many elements do we have?
2248   unsigned NumElts = 1;
2249   switch (N->getOpcode()) {
2250   default:
2251     return false;
2252   case NVPTXISD::StoreParamU32:
2253   case NVPTXISD::StoreParamS32:
2254   case NVPTXISD::StoreParam:
2255     NumElts = 1;
2256     break;
2257   case NVPTXISD::StoreParamV2:
2258     NumElts = 2;
2259     break;
2260   case NVPTXISD::StoreParamV4:
2261     NumElts = 4;
2262     break;
2263   }
2264 
2265   // Build vector of operands
2266   SmallVector<SDValue, 8> Ops;
2267   for (unsigned i = 0; i < NumElts; ++i)
2268     Ops.push_back(N->getOperand(i + 3));
2269   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2270   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2271   Ops.push_back(Chain);
2272   Ops.push_back(Flag);
2273 
2274   // Determine target opcode
2275   // If we have an i1, use an 8-bit store. The lowering code in
2276   // NVPTXISelLowering will have already emitted an upcast.
2277   Optional<unsigned> Opcode = 0;
2278   switch (N->getOpcode()) {
2279   default:
2280     switch (NumElts) {
2281     default:
2282       return false;
2283     case 1:
2284       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2285                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2286                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2287                                NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2288                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2289       break;
2290     case 2:
2291       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2292                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2293                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2294                                NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2295                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2296       break;
2297     case 4:
2298       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2299                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2300                                NVPTX::StoreParamV4I32, None,
2301                                NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2302                                NVPTX::StoreParamV4F32, None);
2303       break;
2304     }
2305     if (!Opcode)
2306       return false;
2307     break;
2308   // Special case: if we have a sign-extend/zero-extend node, insert the
2309   // conversion instruction first, and use that as the value operand to
2310   // the selected StoreParam node.
2311   case NVPTXISD::StoreParamU32: {
2312     Opcode = NVPTX::StoreParamI32;
2313     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2314                                                 MVT::i32);
2315     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2316                                          MVT::i32, Ops[0], CvtNone);
2317     Ops[0] = SDValue(Cvt, 0);
2318     break;
2319   }
2320   case NVPTXISD::StoreParamS32: {
2321     Opcode = NVPTX::StoreParamI32;
2322     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2323                                                 MVT::i32);
2324     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2325                                          MVT::i32, Ops[0], CvtNone);
2326     Ops[0] = SDValue(Cvt, 0);
2327     break;
2328   }
2329   }
2330 
2331   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2332   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2333   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2334   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2335 
2336   ReplaceNode(N, Ret);
2337   return true;
2338 }
2339 
// Select a NVPTXISD texture / tld4 node into the corresponding machine
// instruction. This is a pure 1:1 opcode mapping; the instruction names
// encode geometry (1D/2D/3D/CUBE, optionally ARRAY), result type
// (F32/S32/U32), coordinate type (S32/F32), and sampling mode (basic,
// LEVEL = explicit LOD, GRAD = explicit gradients). Returns false for any
// other node kind.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // Independent mode: separate texture and sampler handles (the _RR suffix
  // indicates both handles are passed in registers).
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  // tld4: fetch one component (R/G/B/A) from each of the four texels that
  // bilinear filtering would use -- independent mode.
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  // Unified mode: a single combined texture+sampler handle (the _R suffix
  // indicates the one handle is passed in a register).
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  // tld4 in unified mode.
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  }

  // Copy over operands, dropping the chain (operand 0) from the front and
  // re-appending it at the end, as the machine instruction expects.
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
2858 
2859 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2860   unsigned Opc = 0;
2861   switch (N->getOpcode()) {
2862   default: return false;
2863   case NVPTXISD::Suld1DI8Clamp:
2864     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2865     break;
2866   case NVPTXISD::Suld1DI16Clamp:
2867     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2868     break;
2869   case NVPTXISD::Suld1DI32Clamp:
2870     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2871     break;
2872   case NVPTXISD::Suld1DI64Clamp:
2873     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2874     break;
2875   case NVPTXISD::Suld1DV2I8Clamp:
2876     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2877     break;
2878   case NVPTXISD::Suld1DV2I16Clamp:
2879     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2880     break;
2881   case NVPTXISD::Suld1DV2I32Clamp:
2882     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2883     break;
2884   case NVPTXISD::Suld1DV2I64Clamp:
2885     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2886     break;
2887   case NVPTXISD::Suld1DV4I8Clamp:
2888     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2889     break;
2890   case NVPTXISD::Suld1DV4I16Clamp:
2891     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2892     break;
2893   case NVPTXISD::Suld1DV4I32Clamp:
2894     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2895     break;
2896   case NVPTXISD::Suld1DArrayI8Clamp:
2897     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2898     break;
2899   case NVPTXISD::Suld1DArrayI16Clamp:
2900     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2901     break;
2902   case NVPTXISD::Suld1DArrayI32Clamp:
2903     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2904     break;
2905   case NVPTXISD::Suld1DArrayI64Clamp:
2906     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2907     break;
2908   case NVPTXISD::Suld1DArrayV2I8Clamp:
2909     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2910     break;
2911   case NVPTXISD::Suld1DArrayV2I16Clamp:
2912     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2913     break;
2914   case NVPTXISD::Suld1DArrayV2I32Clamp:
2915     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2916     break;
2917   case NVPTXISD::Suld1DArrayV2I64Clamp:
2918     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2919     break;
2920   case NVPTXISD::Suld1DArrayV4I8Clamp:
2921     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2922     break;
2923   case NVPTXISD::Suld1DArrayV4I16Clamp:
2924     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2925     break;
2926   case NVPTXISD::Suld1DArrayV4I32Clamp:
2927     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2928     break;
2929   case NVPTXISD::Suld2DI8Clamp:
2930     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2931     break;
2932   case NVPTXISD::Suld2DI16Clamp:
2933     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2934     break;
2935   case NVPTXISD::Suld2DI32Clamp:
2936     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2937     break;
2938   case NVPTXISD::Suld2DI64Clamp:
2939     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2940     break;
2941   case NVPTXISD::Suld2DV2I8Clamp:
2942     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2943     break;
2944   case NVPTXISD::Suld2DV2I16Clamp:
2945     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2946     break;
2947   case NVPTXISD::Suld2DV2I32Clamp:
2948     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2949     break;
2950   case NVPTXISD::Suld2DV2I64Clamp:
2951     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2952     break;
2953   case NVPTXISD::Suld2DV4I8Clamp:
2954     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2955     break;
2956   case NVPTXISD::Suld2DV4I16Clamp:
2957     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2958     break;
2959   case NVPTXISD::Suld2DV4I32Clamp:
2960     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2961     break;
2962   case NVPTXISD::Suld2DArrayI8Clamp:
2963     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2964     break;
2965   case NVPTXISD::Suld2DArrayI16Clamp:
2966     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2967     break;
2968   case NVPTXISD::Suld2DArrayI32Clamp:
2969     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2970     break;
2971   case NVPTXISD::Suld2DArrayI64Clamp:
2972     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2973     break;
2974   case NVPTXISD::Suld2DArrayV2I8Clamp:
2975     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2976     break;
2977   case NVPTXISD::Suld2DArrayV2I16Clamp:
2978     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2979     break;
2980   case NVPTXISD::Suld2DArrayV2I32Clamp:
2981     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2982     break;
2983   case NVPTXISD::Suld2DArrayV2I64Clamp:
2984     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2985     break;
2986   case NVPTXISD::Suld2DArrayV4I8Clamp:
2987     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2988     break;
2989   case NVPTXISD::Suld2DArrayV4I16Clamp:
2990     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2991     break;
2992   case NVPTXISD::Suld2DArrayV4I32Clamp:
2993     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2994     break;
2995   case NVPTXISD::Suld3DI8Clamp:
2996     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
2997     break;
2998   case NVPTXISD::Suld3DI16Clamp:
2999     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3000     break;
3001   case NVPTXISD::Suld3DI32Clamp:
3002     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3003     break;
3004   case NVPTXISD::Suld3DI64Clamp:
3005     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3006     break;
3007   case NVPTXISD::Suld3DV2I8Clamp:
3008     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3009     break;
3010   case NVPTXISD::Suld3DV2I16Clamp:
3011     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3012     break;
3013   case NVPTXISD::Suld3DV2I32Clamp:
3014     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3015     break;
3016   case NVPTXISD::Suld3DV2I64Clamp:
3017     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3018     break;
3019   case NVPTXISD::Suld3DV4I8Clamp:
3020     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3021     break;
3022   case NVPTXISD::Suld3DV4I16Clamp:
3023     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3024     break;
3025   case NVPTXISD::Suld3DV4I32Clamp:
3026     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3027     break;
3028   case NVPTXISD::Suld1DI8Trap:
3029     Opc = NVPTX::SULD_1D_I8_TRAP_R;
3030     break;
3031   case NVPTXISD::Suld1DI16Trap:
3032     Opc = NVPTX::SULD_1D_I16_TRAP_R;
3033     break;
3034   case NVPTXISD::Suld1DI32Trap:
3035     Opc = NVPTX::SULD_1D_I32_TRAP_R;
3036     break;
3037   case NVPTXISD::Suld1DI64Trap:
3038     Opc = NVPTX::SULD_1D_I64_TRAP_R;
3039     break;
3040   case NVPTXISD::Suld1DV2I8Trap:
3041     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3042     break;
3043   case NVPTXISD::Suld1DV2I16Trap:
3044     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3045     break;
3046   case NVPTXISD::Suld1DV2I32Trap:
3047     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3048     break;
3049   case NVPTXISD::Suld1DV2I64Trap:
3050     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3051     break;
3052   case NVPTXISD::Suld1DV4I8Trap:
3053     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3054     break;
3055   case NVPTXISD::Suld1DV4I16Trap:
3056     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3057     break;
3058   case NVPTXISD::Suld1DV4I32Trap:
3059     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3060     break;
3061   case NVPTXISD::Suld1DArrayI8Trap:
3062     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3063     break;
3064   case NVPTXISD::Suld1DArrayI16Trap:
3065     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3066     break;
3067   case NVPTXISD::Suld1DArrayI32Trap:
3068     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3069     break;
3070   case NVPTXISD::Suld1DArrayI64Trap:
3071     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3072     break;
3073   case NVPTXISD::Suld1DArrayV2I8Trap:
3074     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3075     break;
3076   case NVPTXISD::Suld1DArrayV2I16Trap:
3077     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3078     break;
3079   case NVPTXISD::Suld1DArrayV2I32Trap:
3080     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3081     break;
3082   case NVPTXISD::Suld1DArrayV2I64Trap:
3083     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3084     break;
3085   case NVPTXISD::Suld1DArrayV4I8Trap:
3086     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3087     break;
3088   case NVPTXISD::Suld1DArrayV4I16Trap:
3089     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3090     break;
3091   case NVPTXISD::Suld1DArrayV4I32Trap:
3092     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3093     break;
3094   case NVPTXISD::Suld2DI8Trap:
3095     Opc = NVPTX::SULD_2D_I8_TRAP_R;
3096     break;
3097   case NVPTXISD::Suld2DI16Trap:
3098     Opc = NVPTX::SULD_2D_I16_TRAP_R;
3099     break;
3100   case NVPTXISD::Suld2DI32Trap:
3101     Opc = NVPTX::SULD_2D_I32_TRAP_R;
3102     break;
3103   case NVPTXISD::Suld2DI64Trap:
3104     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3105     break;
3106   case NVPTXISD::Suld2DV2I8Trap:
3107     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3108     break;
3109   case NVPTXISD::Suld2DV2I16Trap:
3110     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3111     break;
3112   case NVPTXISD::Suld2DV2I32Trap:
3113     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3114     break;
3115   case NVPTXISD::Suld2DV2I64Trap:
3116     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3117     break;
3118   case NVPTXISD::Suld2DV4I8Trap:
3119     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3120     break;
3121   case NVPTXISD::Suld2DV4I16Trap:
3122     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3123     break;
3124   case NVPTXISD::Suld2DV4I32Trap:
3125     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3126     break;
3127   case NVPTXISD::Suld2DArrayI8Trap:
3128     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3129     break;
3130   case NVPTXISD::Suld2DArrayI16Trap:
3131     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3132     break;
3133   case NVPTXISD::Suld2DArrayI32Trap:
3134     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3135     break;
3136   case NVPTXISD::Suld2DArrayI64Trap:
3137     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3138     break;
3139   case NVPTXISD::Suld2DArrayV2I8Trap:
3140     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3141     break;
3142   case NVPTXISD::Suld2DArrayV2I16Trap:
3143     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3144     break;
3145   case NVPTXISD::Suld2DArrayV2I32Trap:
3146     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3147     break;
3148   case NVPTXISD::Suld2DArrayV2I64Trap:
3149     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3150     break;
3151   case NVPTXISD::Suld2DArrayV4I8Trap:
3152     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3153     break;
3154   case NVPTXISD::Suld2DArrayV4I16Trap:
3155     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3156     break;
3157   case NVPTXISD::Suld2DArrayV4I32Trap:
3158     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3159     break;
3160   case NVPTXISD::Suld3DI8Trap:
3161     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3162     break;
3163   case NVPTXISD::Suld3DI16Trap:
3164     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3165     break;
3166   case NVPTXISD::Suld3DI32Trap:
3167     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3168     break;
3169   case NVPTXISD::Suld3DI64Trap:
3170     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3171     break;
3172   case NVPTXISD::Suld3DV2I8Trap:
3173     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3174     break;
3175   case NVPTXISD::Suld3DV2I16Trap:
3176     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3177     break;
3178   case NVPTXISD::Suld3DV2I32Trap:
3179     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3180     break;
3181   case NVPTXISD::Suld3DV2I64Trap:
3182     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3183     break;
3184   case NVPTXISD::Suld3DV4I8Trap:
3185     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3186     break;
3187   case NVPTXISD::Suld3DV4I16Trap:
3188     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3189     break;
3190   case NVPTXISD::Suld3DV4I32Trap:
3191     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3192     break;
3193   case NVPTXISD::Suld1DI8Zero:
3194     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3195     break;
3196   case NVPTXISD::Suld1DI16Zero:
3197     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3198     break;
3199   case NVPTXISD::Suld1DI32Zero:
3200     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3201     break;
3202   case NVPTXISD::Suld1DI64Zero:
3203     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3204     break;
3205   case NVPTXISD::Suld1DV2I8Zero:
3206     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3207     break;
3208   case NVPTXISD::Suld1DV2I16Zero:
3209     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3210     break;
3211   case NVPTXISD::Suld1DV2I32Zero:
3212     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3213     break;
3214   case NVPTXISD::Suld1DV2I64Zero:
3215     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3216     break;
3217   case NVPTXISD::Suld1DV4I8Zero:
3218     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3219     break;
3220   case NVPTXISD::Suld1DV4I16Zero:
3221     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3222     break;
3223   case NVPTXISD::Suld1DV4I32Zero:
3224     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3225     break;
3226   case NVPTXISD::Suld1DArrayI8Zero:
3227     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3228     break;
3229   case NVPTXISD::Suld1DArrayI16Zero:
3230     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3231     break;
3232   case NVPTXISD::Suld1DArrayI32Zero:
3233     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3234     break;
3235   case NVPTXISD::Suld1DArrayI64Zero:
3236     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3237     break;
3238   case NVPTXISD::Suld1DArrayV2I8Zero:
3239     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3240     break;
3241   case NVPTXISD::Suld1DArrayV2I16Zero:
3242     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3243     break;
3244   case NVPTXISD::Suld1DArrayV2I32Zero:
3245     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3246     break;
3247   case NVPTXISD::Suld1DArrayV2I64Zero:
3248     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3249     break;
3250   case NVPTXISD::Suld1DArrayV4I8Zero:
3251     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3252     break;
3253   case NVPTXISD::Suld1DArrayV4I16Zero:
3254     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3255     break;
3256   case NVPTXISD::Suld1DArrayV4I32Zero:
3257     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3258     break;
3259   case NVPTXISD::Suld2DI8Zero:
3260     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3261     break;
3262   case NVPTXISD::Suld2DI16Zero:
3263     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3264     break;
3265   case NVPTXISD::Suld2DI32Zero:
3266     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3267     break;
3268   case NVPTXISD::Suld2DI64Zero:
3269     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3270     break;
3271   case NVPTXISD::Suld2DV2I8Zero:
3272     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3273     break;
3274   case NVPTXISD::Suld2DV2I16Zero:
3275     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3276     break;
3277   case NVPTXISD::Suld2DV2I32Zero:
3278     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3279     break;
3280   case NVPTXISD::Suld2DV2I64Zero:
3281     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3282     break;
3283   case NVPTXISD::Suld2DV4I8Zero:
3284     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3285     break;
3286   case NVPTXISD::Suld2DV4I16Zero:
3287     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3288     break;
3289   case NVPTXISD::Suld2DV4I32Zero:
3290     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3291     break;
3292   case NVPTXISD::Suld2DArrayI8Zero:
3293     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3294     break;
3295   case NVPTXISD::Suld2DArrayI16Zero:
3296     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3297     break;
3298   case NVPTXISD::Suld2DArrayI32Zero:
3299     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3300     break;
3301   case NVPTXISD::Suld2DArrayI64Zero:
3302     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3303     break;
3304   case NVPTXISD::Suld2DArrayV2I8Zero:
3305     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3306     break;
3307   case NVPTXISD::Suld2DArrayV2I16Zero:
3308     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3309     break;
3310   case NVPTXISD::Suld2DArrayV2I32Zero:
3311     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3312     break;
3313   case NVPTXISD::Suld2DArrayV2I64Zero:
3314     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3315     break;
3316   case NVPTXISD::Suld2DArrayV4I8Zero:
3317     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3318     break;
3319   case NVPTXISD::Suld2DArrayV4I16Zero:
3320     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3321     break;
3322   case NVPTXISD::Suld2DArrayV4I32Zero:
3323     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3324     break;
3325   case NVPTXISD::Suld3DI8Zero:
3326     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3327     break;
3328   case NVPTXISD::Suld3DI16Zero:
3329     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3330     break;
3331   case NVPTXISD::Suld3DI32Zero:
3332     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3333     break;
3334   case NVPTXISD::Suld3DI64Zero:
3335     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3336     break;
3337   case NVPTXISD::Suld3DV2I8Zero:
3338     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3339     break;
3340   case NVPTXISD::Suld3DV2I16Zero:
3341     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3342     break;
3343   case NVPTXISD::Suld3DV2I32Zero:
3344     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3345     break;
3346   case NVPTXISD::Suld3DV2I64Zero:
3347     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3348     break;
3349   case NVPTXISD::Suld3DV4I8Zero:
3350     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3351     break;
3352   case NVPTXISD::Suld3DV4I16Zero:
3353     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3354     break;
3355   case NVPTXISD::Suld3DV4I32Zero:
3356     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3357     break;
3358   }
3359 
3360   // Copy over operands
3361   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3362   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3363 
3364   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3365   return true;
3366 }
3367 
3368 
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
/// Matches (and (srl/sra val, start), mask) and (srl/sra (and/shl ...), amt)
/// shapes rooted at \p N and, when profitable, replaces N with a BFE machine
/// node.  Returns true if a replacement was made, false to fall back to the
/// normal selection path.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;   // BFE field-length operand (i32 target constant).
  SDValue Start; // BFE start-bit operand (i32 target constant).
  SDValue Val;   // Value the field is extracted from.
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left?  "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher-throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        // NOTE(review): if ShiftAmt exceeds the trailing-ones count this
        // unsigned subtraction wraps; presumably such nodes are folded to
        // zero before ISel -- confirm.
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have a arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }


  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants.
  // Only 32- and 64-bit BFE instructions exist; bail out on any other width.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}
3574 
3575 // SelectDirectAddr - Match a direct address for DAG.
3576 // A direct address could be a globaladdress or externalsymbol.
3577 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3578   // Return true if TGA or ES.
3579   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3580       N.getOpcode() == ISD::TargetExternalSymbol) {
3581     Address = N;
3582     return true;
3583   }
3584   if (N.getOpcode() == NVPTXISD::Wrapper) {
3585     Address = N.getOperand(0);
3586     return true;
3587   }
3588   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3589   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3590     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3591         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3592         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3593       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3594   }
3595   return false;
3596 }
3597 
3598 // symbol+offset
3599 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3600     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3601   if (Addr.getOpcode() == ISD::ADD) {
3602     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3603       SDValue base = Addr.getOperand(0);
3604       if (SelectDirectAddr(base, Base)) {
3605         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3606                                            mvt);
3607         return true;
3608       }
3609     }
3610   }
3611   return false;
3612 }
3613 
// symbol+offset, 32-bit pointer variant: delegates to SelectADDRsi_imp with
// an i32 offset type.
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3619 
// symbol+offset, 64-bit pointer variant: delegates to SelectADDRsi_imp with
// an i64 offset type.
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3625 
3626 // register+offset
3627 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3628     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3629   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3630     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3631     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3632     return true;
3633   }
3634   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3635       Addr.getOpcode() == ISD::TargetGlobalAddress)
3636     return false; // direct calls.
3637 
3638   if (Addr.getOpcode() == ISD::ADD) {
3639     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3640       return false;
3641     }
3642     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3643       if (FrameIndexSDNode *FIN =
3644               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3645         // Constant offset from frame ref.
3646         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3647       else
3648         Base = Addr.getOperand(0);
3649       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3650                                          mvt);
3651       return true;
3652     }
3653   }
3654   return false;
3655 }
3656 
// register+offset, 32-bit pointer variant: delegates to SelectADDRri_imp with
// an i32 offset type.
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3662 
// register+offset, 64-bit pointer variant: delegates to SelectADDRri_imp with
// an i64 offset type.
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3668 
3669 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3670                                                  unsigned int spN) const {
3671   const Value *Src = nullptr;
3672   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3673     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3674       return true;
3675     Src = mN->getMemOperand()->getValue();
3676   }
3677   if (!Src)
3678     return false;
3679   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3680     return (PT->getAddressSpace() == spN);
3681   return false;
3682 }
3683 
3684 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3685 /// inline asm expressions.
3686 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3687     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3688   SDValue Op0, Op1;
3689   switch (ConstraintID) {
3690   default:
3691     return true;
3692   case InlineAsm::Constraint_m: // memory
3693     if (SelectDirectAddr(Op, Op0)) {
3694       OutOps.push_back(Op0);
3695       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3696       return false;
3697     }
3698     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3699       OutOps.push_back(Op0);
3700       OutOps.push_back(Op1);
3701       return false;
3702     }
3703     break;
3704   }
3705   return true;
3706 }
3707 
3708 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3709 /// conversion from \p SrcTy to \p DestTy.
3710 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3711                                              bool IsSigned) {
3712   switch (SrcTy.SimpleTy) {
3713   default:
3714     llvm_unreachable("Unhandled source type");
3715   case MVT::i8:
3716     switch (DestTy.SimpleTy) {
3717     default:
3718       llvm_unreachable("Unhandled dest type");
3719     case MVT::i16:
3720       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3721     case MVT::i32:
3722       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3723     case MVT::i64:
3724       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3725     }
3726   case MVT::i16:
3727     switch (DestTy.SimpleTy) {
3728     default:
3729       llvm_unreachable("Unhandled dest type");
3730     case MVT::i8:
3731       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3732     case MVT::i32:
3733       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3734     case MVT::i64:
3735       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3736     }
3737   case MVT::i32:
3738     switch (DestTy.SimpleTy) {
3739     default:
3740       llvm_unreachable("Unhandled dest type");
3741     case MVT::i8:
3742       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3743     case MVT::i16:
3744       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3745     case MVT::i64:
3746       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3747     }
3748   case MVT::i64:
3749     switch (DestTy.SimpleTy) {
3750     default:
3751       llvm_unreachable("Unhandled dest type");
3752     case MVT::i8:
3753       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3754     case MVT::i16:
3755       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3756     case MVT::i32:
3757       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3758     }
3759   }
3760 }
3761