1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "nvptx-isel"
30 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
31 
32 /// createNVPTXISelDag - This pass converts a legalized DAG into a
33 /// NVPTX-specific DAG, ready for instruction scheduling.
34 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
35                                        llvm::CodeGenOpt::Level OptLevel) {
36   return new NVPTXDAGToDAGISel(TM, OptLevel);
37 }
38 
// Pass identification: the *address* of ID (not its value) uniquely
// identifies this pass to the LLVM pass infrastructure.
char NVPTXDAGToDAGISel::ID = 0;

// Register the pass with the pass registry under the "nvptx-isel" name.
INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
42 
43 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
44                                      CodeGenOpt::Level OptLevel)
45     : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
46   doMulWide = (OptLevel > 0);
47 }
48 
49 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
50   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
51   return SelectionDAGISel::runOnMachineFunction(MF);
52 }
53 
54 int NVPTXDAGToDAGISel::getDivF32Level() const {
55   return Subtarget->getTargetLowering()->getDivF32Level();
56 }
57 
58 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
59   return Subtarget->getTargetLowering()->usePrecSqrtF32();
60 }
61 
62 bool NVPTXDAGToDAGISel::useF32FTZ() const {
63   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
64 }
65 
66 bool NVPTXDAGToDAGISel::allowFMA() const {
67   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68   return TL->allowFMA(*MF, OptLevel);
69 }
70 
71 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
72   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
73   return TL->allowUnsafeFPMath(*MF);
74 }
75 
76 bool NVPTXDAGToDAGISel::useShortPointers() const {
77   return TM.useShortPointers();
78 }
79 
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
///
/// Dispatches on the node opcode to a custom try* selector; each of those
/// returns true if it replaced the node. On failure (break) we fall through
/// to the tablegen-generated SelectCode() matcher at the bottom.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    // Unconditional: SETP_F16X2 has no tablegen fallback path.
    SelectSETP_F16X2(N);
    return;

  // Vector loads/stores and parameter/retval traffic.
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // Texture fetch nodes — one case per geometry/result-type/coordinate-type
  // combination, all funneled to tryTextureIntrinsic().
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Surface load nodes — grouped by out-of-bounds behavior suffix
  // (Clamp / Trap / Zero), all funneled to trySurfaceIntrinsic().
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    // Unconditional: address-space casts always lower to cvta here.
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  // Fall back to the tablegen-generated matcher.
  SelectCode(N);
}
511 
512 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
513   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
514   switch (IID) {
515   default:
516     return false;
517   case Intrinsic::nvvm_ldg_global_f:
518   case Intrinsic::nvvm_ldg_global_i:
519   case Intrinsic::nvvm_ldg_global_p:
520   case Intrinsic::nvvm_ldu_global_f:
521   case Intrinsic::nvvm_ldu_global_i:
522   case Intrinsic::nvvm_ldu_global_p:
523     return tryLDGLDU(N);
524   }
525 }
526 
527 // There's no way to specify FP16 immediates in .f16 ops, so we have to
528 // load them into an .f16 register first.
529 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
530   if (N->getValueType(0) != MVT::f16)
531     return false;
532   SDValue Val = CurDAG->getTargetConstantFP(
533       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
534   SDNode *LoadConstF16 =
535       CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
536   ReplaceNode(N, LoadConstF16);
537   return true;
538 }
539 
540 // Map ISD:CONDCODE value to appropriate CmpMode expected by
541 // NVPTXInstPrinter::printCmpMode()
542 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
543   using NVPTX::PTXCmpMode::CmpMode;
544   unsigned PTXCmpMode = [](ISD::CondCode CC) {
545     switch (CC) {
546     default:
547       llvm_unreachable("Unexpected condition code.");
548     case ISD::SETOEQ:
549       return CmpMode::EQ;
550     case ISD::SETOGT:
551       return CmpMode::GT;
552     case ISD::SETOGE:
553       return CmpMode::GE;
554     case ISD::SETOLT:
555       return CmpMode::LT;
556     case ISD::SETOLE:
557       return CmpMode::LE;
558     case ISD::SETONE:
559       return CmpMode::NE;
560     case ISD::SETO:
561       return CmpMode::NUM;
562     case ISD::SETUO:
563       return CmpMode::NotANumber;
564     case ISD::SETUEQ:
565       return CmpMode::EQU;
566     case ISD::SETUGT:
567       return CmpMode::GTU;
568     case ISD::SETUGE:
569       return CmpMode::GEU;
570     case ISD::SETULT:
571       return CmpMode::LTU;
572     case ISD::SETULE:
573       return CmpMode::LEU;
574     case ISD::SETUNE:
575       return CmpMode::NEU;
576     case ISD::SETEQ:
577       return CmpMode::EQ;
578     case ISD::SETGT:
579       return CmpMode::GT;
580     case ISD::SETGE:
581       return CmpMode::GE;
582     case ISD::SETLT:
583       return CmpMode::LT;
584     case ISD::SETLE:
585       return CmpMode::LE;
586     case ISD::SETNE:
587       return CmpMode::NE;
588     }
589   }(CondCode.get());
590 
591   if (FTZ)
592     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
593 
594   return PTXCmpMode;
595 }
596 
597 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
598   unsigned PTXCmpMode =
599       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
600   SDLoc DL(N);
601   SDNode *SetP = CurDAG->getMachineNode(
602       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
603       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
604   ReplaceNode(N, SetP);
605   return true;
606 }
607 
608 // Find all instances of extract_vector_elt that use this v2f16 vector
609 // and coalesce them into a scattering move instruction.
610 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
611   SDValue Vector = N->getOperand(0);
612 
613   // We only care about f16x2 as it's the only real vector type we
614   // need to deal with.
615   if (Vector.getSimpleValueType() != MVT::v2f16)
616     return false;
617 
618   // Find and record all uses of this vector that extract element 0 or 1.
619   SmallVector<SDNode *, 4> E0, E1;
620   for (auto *U : Vector.getNode()->uses()) {
621     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
622       continue;
623     if (U->getOperand(0) != Vector)
624       continue;
625     if (const ConstantSDNode *IdxConst =
626             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
627       if (IdxConst->getZExtValue() == 0)
628         E0.push_back(U);
629       else if (IdxConst->getZExtValue() == 1)
630         E1.push_back(U);
631       else
632         llvm_unreachable("Invalid vector index.");
633     }
634   }
635 
636   // There's no point scattering f16x2 if we only ever access one
637   // element of it.
638   if (E0.empty() || E1.empty())
639     return false;
640 
641   unsigned Op = NVPTX::SplitF16x2;
642   // If the vector has been BITCAST'ed from i32, we can use original
643   // value directly and avoid register-to-register move.
644   SDValue Source = Vector;
645   if (Vector->getOpcode() == ISD::BITCAST) {
646     Op = NVPTX::SplitI32toF16x2;
647     Source = Vector->getOperand(0);
648   }
649   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
650   // into f16,f16 SplitF16x2(V)
651   SDNode *ScatterOp =
652       CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
653   for (auto *Node : E0)
654     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
655   for (auto *Node : E1)
656     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
657 
658   return true;
659 }
660 
661 static unsigned int getCodeAddrSpace(MemSDNode *N) {
662   const Value *Src = N->getMemOperand()->getValue();
663 
664   if (!Src)
665     return NVPTX::PTXLdStInstCode::GENERIC;
666 
667   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
668     switch (PT->getAddressSpace()) {
669     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
670     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
671     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
672     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
673     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
674     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
675     default: break;
676     }
677   }
678   return NVPTX::PTXLdStInstCode::GENERIC;
679 }
680 
681 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
682                           unsigned CodeAddrSpace, MachineFunction *F) {
683   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
684   // space.
685   //
686   // We have two ways of identifying invariant loads: Loads may be explicitly
687   // marked as invariant, or we may infer them to be invariant.
688   //
689   // We currently infer invariance for loads from
690   //  - constant global variables, and
691   //  - kernel function pointer params that are noalias (i.e. __restrict) and
692   //    never written to.
693   //
694   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
695   // not during the SelectionDAG phase).
696   //
697   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
698   // explicitly invariant loads because these are how clang tells us to use ldg
699   // when the user uses a builtin.
700   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
701     return false;
702 
703   if (N->isInvariant())
704     return true;
705 
706   bool IsKernelFn = isKernelFunction(F->getFunction());
707 
708   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
709   // because the former looks through phi nodes while the latter does not. We
710   // need to look through phi nodes to handle pointer induction variables.
711   SmallVector<const Value *, 8> Objs;
712   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
713 
714   return all_of(Objs, [&](const Value *V) {
715     if (auto *A = dyn_cast<const Argument>(V))
716       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
717     if (auto *GV = dyn_cast<const GlobalVariable>(V))
718       return GV->isConstant();
719     return false;
720   });
721 }
722 
723 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
724   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
725   switch (IID) {
726   default:
727     return false;
728   case Intrinsic::nvvm_texsurf_handle_internal:
729     SelectTexSurfHandle(N);
730     return true;
731   }
732 }
733 
734 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
735   // Op 0 is the intrinsic ID
736   SDValue Wrapper = N->getOperand(1);
737   SDValue GlobalVal = Wrapper.getOperand(0);
738   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
739                                         MVT::i64, GlobalVal));
740 }
741 
742 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
743   SDValue Src = N->getOperand(0);
744   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
745   unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
746   unsigned DstAddrSpace = CastN->getDestAddressSpace();
747   assert(SrcAddrSpace != DstAddrSpace &&
748          "addrspacecast must be between different address spaces");
749 
750   if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
751     // Specific to generic
752     unsigned Opc;
753     switch (SrcAddrSpace) {
754     default: report_fatal_error("Bad address space in addrspacecast");
755     case ADDRESS_SPACE_GLOBAL:
756       Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
757       break;
758     case ADDRESS_SPACE_SHARED:
759       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
760                                                : NVPTX::cvta_shared_yes_64)
761                          : NVPTX::cvta_shared_yes;
762       break;
763     case ADDRESS_SPACE_CONST:
764       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
765                                                : NVPTX::cvta_const_yes_64)
766                          : NVPTX::cvta_const_yes;
767       break;
768     case ADDRESS_SPACE_LOCAL:
769       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
770                                                : NVPTX::cvta_local_yes_64)
771                          : NVPTX::cvta_local_yes;
772       break;
773     }
774     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
775                                           Src));
776     return;
777   } else {
778     // Generic to specific
779     if (SrcAddrSpace != 0)
780       report_fatal_error("Cannot cast between two non-generic address spaces");
781     unsigned Opc;
782     switch (DstAddrSpace) {
783     default: report_fatal_error("Bad address space in addrspacecast");
784     case ADDRESS_SPACE_GLOBAL:
785       Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
786                          : NVPTX::cvta_to_global_yes;
787       break;
788     case ADDRESS_SPACE_SHARED:
789       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
790                                                 : NVPTX::cvta_to_shared_yes_64)
791                          : NVPTX::cvta_to_shared_yes;
792       break;
793     case ADDRESS_SPACE_CONST:
794       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
795                                              : NVPTX::cvta_to_const_yes_64)
796                          : NVPTX::cvta_to_const_yes;
797       break;
798     case ADDRESS_SPACE_LOCAL:
799       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
800                                                : NVPTX::cvta_to_local_yes_64)
801                          : NVPTX::cvta_to_local_yes;
802       break;
803     case ADDRESS_SPACE_PARAM:
804       Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
805                          : NVPTX::nvvm_ptr_gen_to_param;
806       break;
807     }
808     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
809                                           Src));
810     return;
811   }
812 }
813 
814 // Helper function template to reduce amount of boilerplate code for
815 // opcode selection.
816 static std::optional<unsigned>
817 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
818                 unsigned Opcode_i16, unsigned Opcode_i32,
819                 std::optional<unsigned> Opcode_i64, unsigned Opcode_f16,
820                 unsigned Opcode_f16x2, unsigned Opcode_f32,
821                 std::optional<unsigned> Opcode_f64) {
822   switch (VT) {
823   case MVT::i1:
824   case MVT::i8:
825     return Opcode_i8;
826   case MVT::i16:
827     return Opcode_i16;
828   case MVT::i32:
829     return Opcode_i32;
830   case MVT::i64:
831     return Opcode_i64;
832   case MVT::f16:
833   case MVT::bf16:
834     return Opcode_f16;
835   case MVT::v2f16:
836   case MVT::v2bf16:
837     return Opcode_f16x2;
838   case MVT::f32:
839     return Opcode_f32;
840   case MVT::f64:
841     return Opcode_f64;
842   default:
843     return std::nullopt;
844   }
845 }
846 
847 static int getLdStRegType(EVT VT) {
848   if (VT.isFloatingPoint())
849     switch (VT.getSimpleVT().SimpleTy) {
850     case MVT::f16:
851     case MVT::bf16:
852     case MVT::v2f16:
853     case MVT::v2bf16:
854       return NVPTX::PTXLdStInstCode::Untyped;
855     default:
856       return NVPTX::PTXLdStInstCode::Float;
857     }
858   else
859     return NVPTX::PTXLdStInstCode::Unsigned;
860 }
861 
// Select a scalar load into one of the NVPTX LD_* machine opcodes. Handles
// plain loads and atomic loads whose ordering is no stronger than monotonic;
// loads that qualify are redirected to the LDG/LDU path instead. Returns
// true and replaces N on success; false hands the node back to the rest of
// the selection logic.
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // Null for atomic loads; only a plain LoadSDNode can be indexed or
  // carry an extension type.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // Prefer the LDG/LDU selection path when this load qualifies for it.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16/v2bf16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // Pick the opcode variant by addressing mode: _avar (direct address),
  // _asi (symbol + immediate), _ari (register + immediate, with _64
  // variants for 64-bit pointers), or _areg (plain register) as the
  // fallback. All variants take the same flag immediates followed by the
  // address operand(s) and the chain.
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Attach the original memory operand so later passes keep the correct
  // aliasing/volatility information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
1007 
// Select a NVPTXISD::LoadV2/LoadV4 vector load into one of the NVPTX LDV_*
// machine opcodes; qualifying loads are redirected to the LDG/LDU path.
// Returns true and replaces N on success, false otherwise.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Prefer the LDG/LDU selection path when this load qualifies for it.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Pick the opcode variant by addressing mode: _avar (direct address),
  // _asi (symbol + immediate), _ari (register + immediate, with _64
  // variants for 64-bit pointers), or _areg (plain register) as the
  // fallback. v4 loads have no i64/f64 forms (std::nullopt below).
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar,
                               NVPTX::LDV_i32_v4_avar, std::nullopt,
                               NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                               NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi,
                               NVPTX::LDV_i32_v4_asi, std::nullopt,
                               NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                               NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari,
                                 NVPTX::LDV_i32_v4_ari, std::nullopt,
                                 NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                                 NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, std::nullopt,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Attach the original memory operand so later passes keep the correct
  // aliasing/volatility information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1243 
1244 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1245 
1246   SDValue Chain = N->getOperand(0);
1247   SDValue Op1;
1248   MemSDNode *Mem;
1249   bool IsLDG = true;
1250 
1251   // If this is an LDG intrinsic, the address is the third operand. If its an
1252   // LDG/LDU SD node (from custom vector handling), then its the second operand
1253   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1254     Op1 = N->getOperand(2);
1255     Mem = cast<MemIntrinsicSDNode>(N);
1256     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1257     switch (IID) {
1258     default:
1259       return false;
1260     case Intrinsic::nvvm_ldg_global_f:
1261     case Intrinsic::nvvm_ldg_global_i:
1262     case Intrinsic::nvvm_ldg_global_p:
1263       IsLDG = true;
1264       break;
1265     case Intrinsic::nvvm_ldu_global_f:
1266     case Intrinsic::nvvm_ldu_global_i:
1267     case Intrinsic::nvvm_ldu_global_p:
1268       IsLDG = false;
1269       break;
1270     }
1271   } else {
1272     Op1 = N->getOperand(1);
1273     Mem = cast<MemSDNode>(N);
1274   }
1275 
1276   std::optional<unsigned> Opcode;
1277   SDLoc DL(N);
1278   SDNode *LD;
1279   SDValue Base, Offset, Addr;
1280 
1281   EVT EltVT = Mem->getMemoryVT();
1282   unsigned NumElts = 1;
1283   if (EltVT.isVector()) {
1284     NumElts = EltVT.getVectorNumElements();
1285     EltVT = EltVT.getVectorElementType();
1286     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1287     if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1288       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1289       EltVT = MVT::v2f16;
1290       NumElts /= 2;
1291     }
1292   }
1293 
1294   // Build the "promoted" result VTList for the load. If we are really loading
1295   // i8s, then the return type will be promoted to i16 since we do not expose
1296   // 8-bit registers in NVPTX.
1297   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1298   SmallVector<EVT, 5> InstVTs;
1299   for (unsigned i = 0; i != NumElts; ++i) {
1300     InstVTs.push_back(NodeVT);
1301   }
1302   InstVTs.push_back(MVT::Other);
1303   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1304 
1305   if (SelectDirectAddr(Op1, Addr)) {
1306     switch (N->getOpcode()) {
1307     default:
1308       return false;
1309     case ISD::LOAD:
1310     case ISD::INTRINSIC_W_CHAIN:
1311       if (IsLDG)
1312         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1313                                      NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1314                                      NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1315                                      NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1316                                      NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1317                                      NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1318                                      NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1319                                      NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1320                                      NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1321       else
1322         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1323                                      NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1324                                      NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1325                                      NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1326                                      NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1327                                      NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1328                                      NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1329                                      NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1330                                      NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1331       break;
1332     case NVPTXISD::LoadV2:
1333     case NVPTXISD::LDGV2:
1334       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1335                                    NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1336                                    NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1337                                    NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1338                                    NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1339                                    NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1340                                    NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1341                                    NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1342                                    NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1343       break;
1344     case NVPTXISD::LDUV2:
1345       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1346                                    NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1347                                    NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1348                                    NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1349                                    NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1350                                    NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1351                                    NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1352                                    NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1353                                    NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1354       break;
1355     case NVPTXISD::LoadV4:
1356     case NVPTXISD::LDGV4:
1357       Opcode = pickOpcodeForVT(
1358           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1359           NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1360           NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1361           NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1362           NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1363           NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1364       break;
1365     case NVPTXISD::LDUV4:
1366       Opcode = pickOpcodeForVT(
1367           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1368           NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1369           NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1370           NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1371           NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1372           NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1373       break;
1374     }
1375     if (!Opcode)
1376       return false;
1377     SDValue Ops[] = { Addr, Chain };
1378     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1379   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1380                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1381     if (TM.is64Bit()) {
1382       switch (N->getOpcode()) {
1383       default:
1384         return false;
1385       case ISD::LOAD:
1386       case ISD::INTRINSIC_W_CHAIN:
1387         if (IsLDG)
1388           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1389                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1390                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1391                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1392                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1393                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1394                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1395                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1396                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1397         else
1398           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1399                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1400                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1401                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1402                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1403                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1404                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1405                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1406                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1407         break;
1408       case NVPTXISD::LoadV2:
1409       case NVPTXISD::LDGV2:
1410         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1411                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1412                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1413                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1414                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1415                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1416                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1417                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1418                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1419         break;
1420       case NVPTXISD::LDUV2:
1421         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1422                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1423                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1424                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1425                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1426                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1427                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1428                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1429                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1430         break;
1431       case NVPTXISD::LoadV4:
1432       case NVPTXISD::LDGV4:
1433         Opcode = pickOpcodeForVT(
1434             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1435             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1436             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1437             NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1438             NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1439             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1440         break;
1441       case NVPTXISD::LDUV4:
1442         Opcode = pickOpcodeForVT(
1443             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1444             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1445             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1446             NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1447             NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1448             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1449         break;
1450       }
1451     } else {
1452       switch (N->getOpcode()) {
1453       default:
1454         return false;
1455       case ISD::LOAD:
1456       case ISD::INTRINSIC_W_CHAIN:
1457         if (IsLDG)
1458           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1459                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1460                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1461                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1462                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1463                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1464                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1465                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1466                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1467         else
1468           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1469                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1470                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1471                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1472                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1473                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1474                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1475                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1476                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1477         break;
1478       case NVPTXISD::LoadV2:
1479       case NVPTXISD::LDGV2:
1480         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1481                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1482                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1483                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1484                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1485                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1486                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1487                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1488                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1489         break;
1490       case NVPTXISD::LDUV2:
1491         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1492                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1493                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1494                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1495                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1496                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1497                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1498                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1499                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1500         break;
1501       case NVPTXISD::LoadV4:
1502       case NVPTXISD::LDGV4:
1503         Opcode = pickOpcodeForVT(
1504             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1505             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1506             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1507             NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1508             NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1509             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1510         break;
1511       case NVPTXISD::LDUV4:
1512         Opcode = pickOpcodeForVT(
1513             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1514             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1515             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1516             NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1517             NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1518             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1519         break;
1520       }
1521     }
1522     if (!Opcode)
1523       return false;
1524     SDValue Ops[] = {Base, Offset, Chain};
1525     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1526   } else {
1527     if (TM.is64Bit()) {
1528       switch (N->getOpcode()) {
1529       default:
1530         return false;
1531       case ISD::LOAD:
1532       case ISD::INTRINSIC_W_CHAIN:
1533         if (IsLDG)
1534           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1535                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1536                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1537                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1538                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1539                                        NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1540                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1541                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1542                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1543         else
1544           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1545                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1546                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1547                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1548                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1549                                        NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1550                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1551                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1552                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1553         break;
1554       case NVPTXISD::LoadV2:
1555       case NVPTXISD::LDGV2:
1556         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1557                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1558                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1559                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1560                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1561                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1562                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1563                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1564                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1565         break;
1566       case NVPTXISD::LDUV2:
1567         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1568                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1569                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1570                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1571                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1572                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1573                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1574                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1575                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1576         break;
1577       case NVPTXISD::LoadV4:
1578       case NVPTXISD::LDGV4:
1579         Opcode = pickOpcodeForVT(
1580             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1581             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1582             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1583             NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1584             NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1585             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1586         break;
1587       case NVPTXISD::LDUV4:
1588         Opcode = pickOpcodeForVT(
1589             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1590             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1591             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1592             NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1593             NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1594             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1595         break;
1596       }
1597     } else {
1598       switch (N->getOpcode()) {
1599       default:
1600         return false;
1601       case ISD::LOAD:
1602       case ISD::INTRINSIC_W_CHAIN:
1603         if (IsLDG)
1604           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1605                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1606                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1607                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1608                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1609                                    NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1610                                    NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1611                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1612                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1613         else
1614           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1615                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1616                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1617                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1618                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1619                                    NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1620                                    NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1621                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1622                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1623         break;
1624       case NVPTXISD::LoadV2:
1625       case NVPTXISD::LDGV2:
1626         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1627                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1628                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1629                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1630                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1631                                  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1632                                  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1633                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1634                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1635         break;
1636       case NVPTXISD::LDUV2:
1637         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1638                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1639                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1640                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1641                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1642                                  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1643                                  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1644                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1645                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1646         break;
1647       case NVPTXISD::LoadV4:
1648       case NVPTXISD::LDGV4:
1649         Opcode = pickOpcodeForVT(
1650             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1651             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1652             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1653             NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1654             NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1655             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1656         break;
1657       case NVPTXISD::LDUV4:
1658         Opcode = pickOpcodeForVT(
1659             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1660             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1661             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1662             NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1663             NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1664             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1665         break;
1666       }
1667     }
1668     if (!Opcode)
1669       return false;
1670     SDValue Ops[] = { Op1, Chain };
1671     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1672   }
1673 
1674   // For automatic generation of LDG (through SelectLoad[Vector], not the
1675   // intrinsics), we may have an extending load like:
1676   //
1677   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1678   //
1679   // In this case, the matching logic above will select a load for the original
1680   // memory type (in this case, i8) and our types will not match (the node needs
1681   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1682   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1683   // CVT instruction. Ptxas should clean up any redundancies here.
1684 
1685   EVT OrigType = N->getValueType(0);
1686   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1687 
1688   if (OrigType != EltVT && LdNode) {
1689     // We have an extending-load. The instruction we selected operates on the
1690     // smaller type, but the SDNode we are replacing has the larger type. We
1691     // need to emit a CVT to make the types match.
1692     bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1693     unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1694                                        EltVT.getSimpleVT(), IsSigned);
1695 
1696     // For each output value, apply the manual sign/zero-extension and make sure
1697     // all users of the load go through that CVT.
1698     for (unsigned i = 0; i != NumElts; ++i) {
1699       SDValue Res(LD, i);
1700       SDValue OrigVal(N, i);
1701 
1702       SDNode *CvtNode =
1703         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1704                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1705                                                          DL, MVT::i32));
1706       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1707     }
1708   }
1709 
1710   ReplaceNode(N, LD);
1711   return true;
1712 }
1713 
// Select a scalar store (plain StoreSDNode or monotonic atomic store) into
// the matching NVPTX ST_* machine instruction. Address modes are attempted in
// order of preference: direct address (avar), symbol+immediate (asi),
// register+immediate (ari), then bare register (areg). Returns false when the
// node cannot be selected here (indexed store, non-simple memory VT, atomic
// ordering stronger than monotonic, or no opcode for the value type).
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  // Opcode selection is keyed on the type of the value being stored, not the
  // memory VT (truncation is encoded via toTypeWidth above).
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Try direct-address form first (avar).
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Symbol + immediate offset (asi).
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Register + immediate offset (ari); the opcode table differs by pointer
    // width.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    // Fall back to plain register addressing (areg).
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Preserve the memory operand so later passes keep alias information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
1875 
1876 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1877   SDValue Chain = N->getOperand(0);
1878   SDValue Op1 = N->getOperand(1);
1879   SDValue Addr, Offset, Base;
1880   std::optional<unsigned> Opcode;
1881   SDLoc DL(N);
1882   SDNode *ST;
1883   EVT EltVT = Op1.getValueType();
1884   MemSDNode *MemSD = cast<MemSDNode>(N);
1885   EVT StoreVT = MemSD->getMemoryVT();
1886 
1887   // Address Space Setting
1888   unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1889   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1890     report_fatal_error("Cannot store to pointer that points to constant "
1891                        "memory space");
1892   }
1893   unsigned int PointerSize =
1894       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1895 
1896   // Volatile Setting
1897   // - .volatile is only availalble for .global and .shared
1898   bool IsVolatile = MemSD->isVolatile();
1899   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1900       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1901       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1902     IsVolatile = false;
1903 
1904   // Type Setting: toType + toTypeWidth
1905   // - for integer type, always use 'u'
1906   assert(StoreVT.isSimple() && "Store value is not simple");
1907   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1908   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1909   unsigned ToType = getLdStRegType(ScalarVT);
1910 
1911   SmallVector<SDValue, 12> StOps;
1912   SDValue N2;
1913   unsigned VecType;
1914 
1915   switch (N->getOpcode()) {
1916   case NVPTXISD::StoreV2:
1917     VecType = NVPTX::PTXLdStInstCode::V2;
1918     StOps.push_back(N->getOperand(1));
1919     StOps.push_back(N->getOperand(2));
1920     N2 = N->getOperand(3);
1921     break;
1922   case NVPTXISD::StoreV4:
1923     VecType = NVPTX::PTXLdStInstCode::V4;
1924     StOps.push_back(N->getOperand(1));
1925     StOps.push_back(N->getOperand(2));
1926     StOps.push_back(N->getOperand(3));
1927     StOps.push_back(N->getOperand(4));
1928     N2 = N->getOperand(5);
1929     break;
1930   default:
1931     return false;
1932   }
1933 
1934   // v8f16 is a special case. PTX doesn't have st.v8.f16
1935   // instruction. Instead, we split the vector into v2f16 chunks and
1936   // store them with st.v4.b32.
1937   if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
1938     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1939     EltVT = MVT::i32;
1940     ToType = NVPTX::PTXLdStInstCode::Untyped;
1941     ToTypeWidth = 32;
1942   }
1943 
1944   StOps.push_back(getI32Imm(IsVolatile, DL));
1945   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1946   StOps.push_back(getI32Imm(VecType, DL));
1947   StOps.push_back(getI32Imm(ToType, DL));
1948   StOps.push_back(getI32Imm(ToTypeWidth, DL));
1949 
1950   if (SelectDirectAddr(N2, Addr)) {
1951     switch (N->getOpcode()) {
1952     default:
1953       return false;
1954     case NVPTXISD::StoreV2:
1955       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1956                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1957                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1958                                NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1959                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1960       break;
1961     case NVPTXISD::StoreV4:
1962       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1963                                NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1964                                NVPTX::STV_i32_v4_avar, std::nullopt,
1965                                NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1966                                NVPTX::STV_f32_v4_avar, std::nullopt);
1967       break;
1968     }
1969     StOps.push_back(Addr);
1970   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1971                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1972     switch (N->getOpcode()) {
1973     default:
1974       return false;
1975     case NVPTXISD::StoreV2:
1976       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1977                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1978                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1979                                NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1980                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1981       break;
1982     case NVPTXISD::StoreV4:
1983       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1984                                NVPTX::STV_i8_v4_asi, NVPTX::STV_i16_v4_asi,
1985                                NVPTX::STV_i32_v4_asi, std::nullopt,
1986                                NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1987                                NVPTX::STV_f32_v4_asi, std::nullopt);
1988       break;
1989     }
1990     StOps.push_back(Base);
1991     StOps.push_back(Offset);
1992   } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1993                                : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1994     if (PointerSize == 64) {
1995       switch (N->getOpcode()) {
1996       default:
1997         return false;
1998       case NVPTXISD::StoreV2:
1999         Opcode = pickOpcodeForVT(
2000             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2001             NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2002             NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2003             NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2004             NVPTX::STV_f64_v2_ari_64);
2005         break;
2006       case NVPTXISD::StoreV4:
2007         Opcode = pickOpcodeForVT(
2008             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2009             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
2010             NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2011             NVPTX::STV_f32_v4_ari_64, std::nullopt);
2012         break;
2013       }
2014     } else {
2015       switch (N->getOpcode()) {
2016       default:
2017         return false;
2018       case NVPTXISD::StoreV2:
2019         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2020                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2021                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2022                                  NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2023                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2024         break;
2025       case NVPTXISD::StoreV4:
2026         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2027                                  NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
2028                                  NVPTX::STV_i32_v4_ari, std::nullopt,
2029                                  NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2030                                  NVPTX::STV_f32_v4_ari, std::nullopt);
2031         break;
2032       }
2033     }
2034     StOps.push_back(Base);
2035     StOps.push_back(Offset);
2036   } else {
2037     if (PointerSize == 64) {
2038       switch (N->getOpcode()) {
2039       default:
2040         return false;
2041       case NVPTXISD::StoreV2:
2042         Opcode = pickOpcodeForVT(
2043             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2044             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2045             NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2046             NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2047             NVPTX::STV_f64_v2_areg_64);
2048         break;
2049       case NVPTXISD::StoreV4:
2050         Opcode = pickOpcodeForVT(
2051             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2052             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2053             NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2054             NVPTX::STV_f32_v4_areg_64, std::nullopt);
2055         break;
2056       }
2057     } else {
2058       switch (N->getOpcode()) {
2059       default:
2060         return false;
2061       case NVPTXISD::StoreV2:
2062         Opcode =
2063             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2064                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2065                             NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2066                             NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2067                             NVPTX::STV_f64_v2_areg);
2068         break;
2069       case NVPTXISD::StoreV4:
2070         Opcode = pickOpcodeForVT(
2071             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2072             NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, std::nullopt,
2073             NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2074             NVPTX::STV_f32_v4_areg, std::nullopt);
2075         break;
2076       }
2077     }
2078     StOps.push_back(N2);
2079   }
2080 
2081   if (!Opcode)
2082     return false;
2083 
2084   StOps.push_back(Chain);
2085 
2086   ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2087 
2088   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2089   CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2090 
2091   ReplaceNode(N, ST);
2092   return true;
2093 }
2094 
2095 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2096   SDValue Chain = Node->getOperand(0);
2097   SDValue Offset = Node->getOperand(2);
2098   SDValue Flag = Node->getOperand(3);
2099   SDLoc DL(Node);
2100   MemSDNode *Mem = cast<MemSDNode>(Node);
2101 
2102   unsigned VecSize;
2103   switch (Node->getOpcode()) {
2104   default:
2105     return false;
2106   case NVPTXISD::LoadParam:
2107     VecSize = 1;
2108     break;
2109   case NVPTXISD::LoadParamV2:
2110     VecSize = 2;
2111     break;
2112   case NVPTXISD::LoadParamV4:
2113     VecSize = 4;
2114     break;
2115   }
2116 
2117   EVT EltVT = Node->getValueType(0);
2118   EVT MemVT = Mem->getMemoryVT();
2119 
2120   std::optional<unsigned> Opcode;
2121 
2122   switch (VecSize) {
2123   default:
2124     return false;
2125   case 1:
2126     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2127                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2128                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2129                              NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2130                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2131     break;
2132   case 2:
2133     Opcode =
2134         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2135                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2136                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2137                         NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2138                         NVPTX::LoadParamMemV2F64);
2139     break;
2140   case 4:
2141     Opcode = pickOpcodeForVT(
2142         MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2143         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, std::nullopt,
2144         NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2145         NVPTX::LoadParamMemV4F32, std::nullopt);
2146     break;
2147   }
2148   if (!Opcode)
2149     return false;
2150 
2151   SDVTList VTs;
2152   if (VecSize == 1) {
2153     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2154   } else if (VecSize == 2) {
2155     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2156   } else {
2157     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2158     VTs = CurDAG->getVTList(EVTs);
2159   }
2160 
2161   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2162 
2163   SmallVector<SDValue, 2> Ops;
2164   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2165   Ops.push_back(Chain);
2166   Ops.push_back(Flag);
2167 
2168   ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2169   return true;
2170 }
2171 
2172 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2173   SDLoc DL(N);
2174   SDValue Chain = N->getOperand(0);
2175   SDValue Offset = N->getOperand(1);
2176   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2177   MemSDNode *Mem = cast<MemSDNode>(N);
2178 
2179   // How many elements do we have?
2180   unsigned NumElts = 1;
2181   switch (N->getOpcode()) {
2182   default:
2183     return false;
2184   case NVPTXISD::StoreRetval:
2185     NumElts = 1;
2186     break;
2187   case NVPTXISD::StoreRetvalV2:
2188     NumElts = 2;
2189     break;
2190   case NVPTXISD::StoreRetvalV4:
2191     NumElts = 4;
2192     break;
2193   }
2194 
2195   // Build vector of operands
2196   SmallVector<SDValue, 6> Ops;
2197   for (unsigned i = 0; i < NumElts; ++i)
2198     Ops.push_back(N->getOperand(i + 2));
2199   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2200   Ops.push_back(Chain);
2201 
2202   // Determine target opcode
2203   // If we have an i1, use an 8-bit store. The lowering code in
2204   // NVPTXISelLowering will have already emitted an upcast.
2205   std::optional<unsigned> Opcode = 0;
2206   switch (NumElts) {
2207   default:
2208     return false;
2209   case 1:
2210     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2212                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2213                              NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2214                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2215     break;
2216   case 2:
2217     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2218                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2219                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2220                              NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2221                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2222     break;
2223   case 4:
2224     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2225                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2226                              NVPTX::StoreRetvalV4I32, std::nullopt,
2227                              NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2228                              NVPTX::StoreRetvalV4F32, std::nullopt);
2229     break;
2230   }
2231   if (!Opcode)
2232     return false;
2233 
2234   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2235   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2236   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2237 
2238   ReplaceNode(N, Ret);
2239   return true;
2240 }
2241 
2242 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2243   SDLoc DL(N);
2244   SDValue Chain = N->getOperand(0);
2245   SDValue Param = N->getOperand(1);
2246   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2247   SDValue Offset = N->getOperand(2);
2248   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2249   MemSDNode *Mem = cast<MemSDNode>(N);
2250   SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2251 
2252   // How many elements do we have?
2253   unsigned NumElts = 1;
2254   switch (N->getOpcode()) {
2255   default:
2256     return false;
2257   case NVPTXISD::StoreParamU32:
2258   case NVPTXISD::StoreParamS32:
2259   case NVPTXISD::StoreParam:
2260     NumElts = 1;
2261     break;
2262   case NVPTXISD::StoreParamV2:
2263     NumElts = 2;
2264     break;
2265   case NVPTXISD::StoreParamV4:
2266     NumElts = 4;
2267     break;
2268   }
2269 
2270   // Build vector of operands
2271   SmallVector<SDValue, 8> Ops;
2272   for (unsigned i = 0; i < NumElts; ++i)
2273     Ops.push_back(N->getOperand(i + 3));
2274   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2275   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2276   Ops.push_back(Chain);
2277   Ops.push_back(Flag);
2278 
2279   // Determine target opcode
2280   // If we have an i1, use an 8-bit store. The lowering code in
2281   // NVPTXISelLowering will have already emitted an upcast.
2282   std::optional<unsigned> Opcode = 0;
2283   switch (N->getOpcode()) {
2284   default:
2285     switch (NumElts) {
2286     default:
2287       return false;
2288     case 1:
2289       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2290                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2291                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2292                                NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2293                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2294       break;
2295     case 2:
2296       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2297                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2298                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2299                                NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2300                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2301       break;
2302     case 4:
2303       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2304                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2305                                NVPTX::StoreParamV4I32, std::nullopt,
2306                                NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2307                                NVPTX::StoreParamV4F32, std::nullopt);
2308       break;
2309     }
2310     if (!Opcode)
2311       return false;
2312     break;
2313   // Special case: if we have a sign-extend/zero-extend node, insert the
2314   // conversion instruction first, and use that as the value operand to
2315   // the selected StoreParam node.
2316   case NVPTXISD::StoreParamU32: {
2317     Opcode = NVPTX::StoreParamI32;
2318     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2319                                                 MVT::i32);
2320     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2321                                          MVT::i32, Ops[0], CvtNone);
2322     Ops[0] = SDValue(Cvt, 0);
2323     break;
2324   }
2325   case NVPTXISD::StoreParamS32: {
2326     Opcode = NVPTX::StoreParamI32;
2327     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2328                                                 MVT::i32);
2329     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2330                                          MVT::i32, Ops[0], CvtNone);
2331     Ops[0] = SDValue(Cvt, 0);
2332     break;
2333   }
2334   }
2335 
2336   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2337   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2338   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2339   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2340 
2341   ReplaceNode(N, Ret);
2342   return true;
2343 }
2344 
// Select a texture fetch / tld4 gather node by mapping its NVPTXISD opcode
// one-to-one onto the corresponding machine instruction, then re-emitting the
// node with its operands copied over and the chain moved to the back.
// Returns false for any opcode this table does not cover. As the mapping
// shows, the non-unified Tex*/Tld4* opcodes select the *_RR instruction
// variants, while the TexUnified*/Tld4Unified* opcodes select *_R variants.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // Non-unified texture fetches (Tex*): 1D, 1D array, 2D, 2D array, 3D,
  // cube, and cube array, each with plain / LEVEL / GRAD addressing where
  // a corresponding NVPTXISD opcode exists.
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  // Non-unified tld4 gathers, one opcode per component (R/G/B/A).
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  // Unified-mode texture fetches (TexUnified*): same geometry/type grid as
  // above, mapped to the _R instruction variants.
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  // Unified-mode tld4 gathers, one opcode per component (R/G/B/A).
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands, dropping the chain (operand 0) from the front and
  // re-appending it at the end as the machine node expects.
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
2863 
2864 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2865   unsigned Opc = 0;
2866   switch (N->getOpcode()) {
2867   default: return false;
2868   case NVPTXISD::Suld1DI8Clamp:
2869     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2870     break;
2871   case NVPTXISD::Suld1DI16Clamp:
2872     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2873     break;
2874   case NVPTXISD::Suld1DI32Clamp:
2875     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2876     break;
2877   case NVPTXISD::Suld1DI64Clamp:
2878     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2879     break;
2880   case NVPTXISD::Suld1DV2I8Clamp:
2881     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2882     break;
2883   case NVPTXISD::Suld1DV2I16Clamp:
2884     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2885     break;
2886   case NVPTXISD::Suld1DV2I32Clamp:
2887     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2888     break;
2889   case NVPTXISD::Suld1DV2I64Clamp:
2890     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2891     break;
2892   case NVPTXISD::Suld1DV4I8Clamp:
2893     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2894     break;
2895   case NVPTXISD::Suld1DV4I16Clamp:
2896     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2897     break;
2898   case NVPTXISD::Suld1DV4I32Clamp:
2899     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2900     break;
2901   case NVPTXISD::Suld1DArrayI8Clamp:
2902     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2903     break;
2904   case NVPTXISD::Suld1DArrayI16Clamp:
2905     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2906     break;
2907   case NVPTXISD::Suld1DArrayI32Clamp:
2908     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2909     break;
2910   case NVPTXISD::Suld1DArrayI64Clamp:
2911     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2912     break;
2913   case NVPTXISD::Suld1DArrayV2I8Clamp:
2914     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2915     break;
2916   case NVPTXISD::Suld1DArrayV2I16Clamp:
2917     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2918     break;
2919   case NVPTXISD::Suld1DArrayV2I32Clamp:
2920     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2921     break;
2922   case NVPTXISD::Suld1DArrayV2I64Clamp:
2923     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2924     break;
2925   case NVPTXISD::Suld1DArrayV4I8Clamp:
2926     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2927     break;
2928   case NVPTXISD::Suld1DArrayV4I16Clamp:
2929     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2930     break;
2931   case NVPTXISD::Suld1DArrayV4I32Clamp:
2932     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2933     break;
2934   case NVPTXISD::Suld2DI8Clamp:
2935     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2936     break;
2937   case NVPTXISD::Suld2DI16Clamp:
2938     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2939     break;
2940   case NVPTXISD::Suld2DI32Clamp:
2941     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2942     break;
2943   case NVPTXISD::Suld2DI64Clamp:
2944     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2945     break;
2946   case NVPTXISD::Suld2DV2I8Clamp:
2947     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2948     break;
2949   case NVPTXISD::Suld2DV2I16Clamp:
2950     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2951     break;
2952   case NVPTXISD::Suld2DV2I32Clamp:
2953     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2954     break;
2955   case NVPTXISD::Suld2DV2I64Clamp:
2956     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2957     break;
2958   case NVPTXISD::Suld2DV4I8Clamp:
2959     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2960     break;
2961   case NVPTXISD::Suld2DV4I16Clamp:
2962     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2963     break;
2964   case NVPTXISD::Suld2DV4I32Clamp:
2965     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2966     break;
2967   case NVPTXISD::Suld2DArrayI8Clamp:
2968     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2969     break;
2970   case NVPTXISD::Suld2DArrayI16Clamp:
2971     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2972     break;
2973   case NVPTXISD::Suld2DArrayI32Clamp:
2974     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2975     break;
2976   case NVPTXISD::Suld2DArrayI64Clamp:
2977     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2978     break;
2979   case NVPTXISD::Suld2DArrayV2I8Clamp:
2980     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2981     break;
2982   case NVPTXISD::Suld2DArrayV2I16Clamp:
2983     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2984     break;
2985   case NVPTXISD::Suld2DArrayV2I32Clamp:
2986     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2987     break;
2988   case NVPTXISD::Suld2DArrayV2I64Clamp:
2989     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2990     break;
2991   case NVPTXISD::Suld2DArrayV4I8Clamp:
2992     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2993     break;
2994   case NVPTXISD::Suld2DArrayV4I16Clamp:
2995     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2996     break;
2997   case NVPTXISD::Suld2DArrayV4I32Clamp:
2998     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2999     break;
3000   case NVPTXISD::Suld3DI8Clamp:
3001     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3002     break;
3003   case NVPTXISD::Suld3DI16Clamp:
3004     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3005     break;
3006   case NVPTXISD::Suld3DI32Clamp:
3007     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3008     break;
3009   case NVPTXISD::Suld3DI64Clamp:
3010     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3011     break;
3012   case NVPTXISD::Suld3DV2I8Clamp:
3013     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3014     break;
3015   case NVPTXISD::Suld3DV2I16Clamp:
3016     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3017     break;
3018   case NVPTXISD::Suld3DV2I32Clamp:
3019     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3020     break;
3021   case NVPTXISD::Suld3DV2I64Clamp:
3022     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3023     break;
3024   case NVPTXISD::Suld3DV4I8Clamp:
3025     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3026     break;
3027   case NVPTXISD::Suld3DV4I16Clamp:
3028     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3029     break;
3030   case NVPTXISD::Suld3DV4I32Clamp:
3031     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3032     break;
3033   case NVPTXISD::Suld1DI8Trap:
3034     Opc = NVPTX::SULD_1D_I8_TRAP_R;
3035     break;
3036   case NVPTXISD::Suld1DI16Trap:
3037     Opc = NVPTX::SULD_1D_I16_TRAP_R;
3038     break;
3039   case NVPTXISD::Suld1DI32Trap:
3040     Opc = NVPTX::SULD_1D_I32_TRAP_R;
3041     break;
3042   case NVPTXISD::Suld1DI64Trap:
3043     Opc = NVPTX::SULD_1D_I64_TRAP_R;
3044     break;
3045   case NVPTXISD::Suld1DV2I8Trap:
3046     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3047     break;
3048   case NVPTXISD::Suld1DV2I16Trap:
3049     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3050     break;
3051   case NVPTXISD::Suld1DV2I32Trap:
3052     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3053     break;
3054   case NVPTXISD::Suld1DV2I64Trap:
3055     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3056     break;
3057   case NVPTXISD::Suld1DV4I8Trap:
3058     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3059     break;
3060   case NVPTXISD::Suld1DV4I16Trap:
3061     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3062     break;
3063   case NVPTXISD::Suld1DV4I32Trap:
3064     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3065     break;
3066   case NVPTXISD::Suld1DArrayI8Trap:
3067     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3068     break;
3069   case NVPTXISD::Suld1DArrayI16Trap:
3070     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3071     break;
3072   case NVPTXISD::Suld1DArrayI32Trap:
3073     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3074     break;
3075   case NVPTXISD::Suld1DArrayI64Trap:
3076     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3077     break;
3078   case NVPTXISD::Suld1DArrayV2I8Trap:
3079     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3080     break;
3081   case NVPTXISD::Suld1DArrayV2I16Trap:
3082     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3083     break;
3084   case NVPTXISD::Suld1DArrayV2I32Trap:
3085     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3086     break;
3087   case NVPTXISD::Suld1DArrayV2I64Trap:
3088     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3089     break;
3090   case NVPTXISD::Suld1DArrayV4I8Trap:
3091     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3092     break;
3093   case NVPTXISD::Suld1DArrayV4I16Trap:
3094     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3095     break;
3096   case NVPTXISD::Suld1DArrayV4I32Trap:
3097     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3098     break;
3099   case NVPTXISD::Suld2DI8Trap:
3100     Opc = NVPTX::SULD_2D_I8_TRAP_R;
3101     break;
3102   case NVPTXISD::Suld2DI16Trap:
3103     Opc = NVPTX::SULD_2D_I16_TRAP_R;
3104     break;
3105   case NVPTXISD::Suld2DI32Trap:
3106     Opc = NVPTX::SULD_2D_I32_TRAP_R;
3107     break;
3108   case NVPTXISD::Suld2DI64Trap:
3109     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3110     break;
3111   case NVPTXISD::Suld2DV2I8Trap:
3112     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3113     break;
3114   case NVPTXISD::Suld2DV2I16Trap:
3115     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3116     break;
3117   case NVPTXISD::Suld2DV2I32Trap:
3118     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3119     break;
3120   case NVPTXISD::Suld2DV2I64Trap:
3121     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3122     break;
3123   case NVPTXISD::Suld2DV4I8Trap:
3124     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3125     break;
3126   case NVPTXISD::Suld2DV4I16Trap:
3127     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3128     break;
3129   case NVPTXISD::Suld2DV4I32Trap:
3130     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3131     break;
3132   case NVPTXISD::Suld2DArrayI8Trap:
3133     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3134     break;
3135   case NVPTXISD::Suld2DArrayI16Trap:
3136     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3137     break;
3138   case NVPTXISD::Suld2DArrayI32Trap:
3139     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3140     break;
3141   case NVPTXISD::Suld2DArrayI64Trap:
3142     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3143     break;
3144   case NVPTXISD::Suld2DArrayV2I8Trap:
3145     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3146     break;
3147   case NVPTXISD::Suld2DArrayV2I16Trap:
3148     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3149     break;
3150   case NVPTXISD::Suld2DArrayV2I32Trap:
3151     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3152     break;
3153   case NVPTXISD::Suld2DArrayV2I64Trap:
3154     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3155     break;
3156   case NVPTXISD::Suld2DArrayV4I8Trap:
3157     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3158     break;
3159   case NVPTXISD::Suld2DArrayV4I16Trap:
3160     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3161     break;
3162   case NVPTXISD::Suld2DArrayV4I32Trap:
3163     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3164     break;
3165   case NVPTXISD::Suld3DI8Trap:
3166     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3167     break;
3168   case NVPTXISD::Suld3DI16Trap:
3169     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3170     break;
3171   case NVPTXISD::Suld3DI32Trap:
3172     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3173     break;
3174   case NVPTXISD::Suld3DI64Trap:
3175     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3176     break;
3177   case NVPTXISD::Suld3DV2I8Trap:
3178     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3179     break;
3180   case NVPTXISD::Suld3DV2I16Trap:
3181     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3182     break;
3183   case NVPTXISD::Suld3DV2I32Trap:
3184     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3185     break;
3186   case NVPTXISD::Suld3DV2I64Trap:
3187     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3188     break;
3189   case NVPTXISD::Suld3DV4I8Trap:
3190     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3191     break;
3192   case NVPTXISD::Suld3DV4I16Trap:
3193     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3194     break;
3195   case NVPTXISD::Suld3DV4I32Trap:
3196     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3197     break;
3198   case NVPTXISD::Suld1DI8Zero:
3199     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3200     break;
3201   case NVPTXISD::Suld1DI16Zero:
3202     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3203     break;
3204   case NVPTXISD::Suld1DI32Zero:
3205     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3206     break;
3207   case NVPTXISD::Suld1DI64Zero:
3208     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3209     break;
3210   case NVPTXISD::Suld1DV2I8Zero:
3211     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3212     break;
3213   case NVPTXISD::Suld1DV2I16Zero:
3214     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3215     break;
3216   case NVPTXISD::Suld1DV2I32Zero:
3217     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3218     break;
3219   case NVPTXISD::Suld1DV2I64Zero:
3220     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3221     break;
3222   case NVPTXISD::Suld1DV4I8Zero:
3223     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3224     break;
3225   case NVPTXISD::Suld1DV4I16Zero:
3226     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3227     break;
3228   case NVPTXISD::Suld1DV4I32Zero:
3229     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3230     break;
3231   case NVPTXISD::Suld1DArrayI8Zero:
3232     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3233     break;
3234   case NVPTXISD::Suld1DArrayI16Zero:
3235     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3236     break;
3237   case NVPTXISD::Suld1DArrayI32Zero:
3238     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3239     break;
3240   case NVPTXISD::Suld1DArrayI64Zero:
3241     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3242     break;
3243   case NVPTXISD::Suld1DArrayV2I8Zero:
3244     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3245     break;
3246   case NVPTXISD::Suld1DArrayV2I16Zero:
3247     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3248     break;
3249   case NVPTXISD::Suld1DArrayV2I32Zero:
3250     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3251     break;
3252   case NVPTXISD::Suld1DArrayV2I64Zero:
3253     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3254     break;
3255   case NVPTXISD::Suld1DArrayV4I8Zero:
3256     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3257     break;
3258   case NVPTXISD::Suld1DArrayV4I16Zero:
3259     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3260     break;
3261   case NVPTXISD::Suld1DArrayV4I32Zero:
3262     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3263     break;
3264   case NVPTXISD::Suld2DI8Zero:
3265     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3266     break;
3267   case NVPTXISD::Suld2DI16Zero:
3268     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3269     break;
3270   case NVPTXISD::Suld2DI32Zero:
3271     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3272     break;
3273   case NVPTXISD::Suld2DI64Zero:
3274     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3275     break;
3276   case NVPTXISD::Suld2DV2I8Zero:
3277     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3278     break;
3279   case NVPTXISD::Suld2DV2I16Zero:
3280     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3281     break;
3282   case NVPTXISD::Suld2DV2I32Zero:
3283     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3284     break;
3285   case NVPTXISD::Suld2DV2I64Zero:
3286     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3287     break;
3288   case NVPTXISD::Suld2DV4I8Zero:
3289     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3290     break;
3291   case NVPTXISD::Suld2DV4I16Zero:
3292     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3293     break;
3294   case NVPTXISD::Suld2DV4I32Zero:
3295     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3296     break;
3297   case NVPTXISD::Suld2DArrayI8Zero:
3298     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3299     break;
3300   case NVPTXISD::Suld2DArrayI16Zero:
3301     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3302     break;
3303   case NVPTXISD::Suld2DArrayI32Zero:
3304     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3305     break;
3306   case NVPTXISD::Suld2DArrayI64Zero:
3307     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3308     break;
3309   case NVPTXISD::Suld2DArrayV2I8Zero:
3310     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3311     break;
3312   case NVPTXISD::Suld2DArrayV2I16Zero:
3313     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3314     break;
3315   case NVPTXISD::Suld2DArrayV2I32Zero:
3316     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3317     break;
3318   case NVPTXISD::Suld2DArrayV2I64Zero:
3319     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3320     break;
3321   case NVPTXISD::Suld2DArrayV4I8Zero:
3322     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3323     break;
3324   case NVPTXISD::Suld2DArrayV4I16Zero:
3325     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3326     break;
3327   case NVPTXISD::Suld2DArrayV4I32Zero:
3328     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3329     break;
3330   case NVPTXISD::Suld3DI8Zero:
3331     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3332     break;
3333   case NVPTXISD::Suld3DI16Zero:
3334     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3335     break;
3336   case NVPTXISD::Suld3DI32Zero:
3337     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3338     break;
3339   case NVPTXISD::Suld3DI64Zero:
3340     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3341     break;
3342   case NVPTXISD::Suld3DV2I8Zero:
3343     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3344     break;
3345   case NVPTXISD::Suld3DV2I16Zero:
3346     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3347     break;
3348   case NVPTXISD::Suld3DV2I32Zero:
3349     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3350     break;
3351   case NVPTXISD::Suld3DV2I64Zero:
3352     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3353     break;
3354   case NVPTXISD::Suld3DV4I8Zero:
3355     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3356     break;
3357   case NVPTXISD::Suld3DV4I16Zero:
3358     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3359     break;
3360   case NVPTXISD::Suld3DV4I32Zero:
3361     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3362     break;
3363   }
3364 
3365   // Copy over operands
3366   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3367   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3368 
3369   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3370   return true;
3371 }
3372 
3373 
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
///
/// Recognizes three DAG shapes rooted at \p N and rewrites them to a single
/// BFE machine node when profitable:
///   (and (srl/sra val, start), mask)   - shift then mask
///   (srl/sra (and val, mask), shift)   - mask then shift
///   (srl/sra (shl val, NN), MM)        - left shift then right shift
/// Returns true if \p N was replaced, false if the pattern does not match or
/// the transformation would not be profitable.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  // Operands of the BFE node we may emit: the source value, the start bit
  // position, and the field length (the latter two as i32 target constants).
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false; // select the sign-extending BFE variant

  if (N->getOpcode() == ISD::AND) {
    // Pattern: (and (srl/sra val, start), mask)
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left?  "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher-throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      // Pattern: (srl/sra (and val, mask), shift)
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have a arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }


  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants.  (IsSigned is only set by the sra-of-shl pattern
  // handled above.)
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  // Operand order matches the BFE instruction: value, start bit, length.
  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}
3579 
3580 // SelectDirectAddr - Match a direct address for DAG.
3581 // A direct address could be a globaladdress or externalsymbol.
3582 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3583   // Return true if TGA or ES.
3584   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3585       N.getOpcode() == ISD::TargetExternalSymbol) {
3586     Address = N;
3587     return true;
3588   }
3589   if (N.getOpcode() == NVPTXISD::Wrapper) {
3590     Address = N.getOperand(0);
3591     return true;
3592   }
3593   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3595     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3596         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3597         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3598       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3599   }
3600   return false;
3601 }
3602 
3603 // symbol+offset
3604 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3606   if (Addr.getOpcode() == ISD::ADD) {
3607     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3608       SDValue base = Addr.getOperand(0);
3609       if (SelectDirectAddr(base, Base)) {
3610         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3611                                            mvt);
3612         return true;
3613       }
3614     }
3615   }
3616   return false;
3617 }
3618 
3619 // symbol+offset
3620 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3621                                      SDValue &Base, SDValue &Offset) {
3622   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3623 }
3624 
3625 // symbol+offset
3626 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3627                                        SDValue &Base, SDValue &Offset) {
3628   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3629 }
3630 
3631 // register+offset
3632 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3633     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3634   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3635     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3636     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3637     return true;
3638   }
3639   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3640       Addr.getOpcode() == ISD::TargetGlobalAddress)
3641     return false; // direct calls.
3642 
3643   if (Addr.getOpcode() == ISD::ADD) {
3644     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3645       return false;
3646     }
3647     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3648       if (FrameIndexSDNode *FIN =
3649               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3650         // Constant offset from frame ref.
3651         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3652       else
3653         Base = Addr.getOperand(0);
3654       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3655                                          mvt);
3656       return true;
3657     }
3658   }
3659   return false;
3660 }
3661 
3662 // register+offset
3663 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3664                                      SDValue &Base, SDValue &Offset) {
3665   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3666 }
3667 
3668 // register+offset
3669 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3670                                        SDValue &Base, SDValue &Offset) {
3671   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3672 }
3673 
3674 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3675                                                  unsigned int spN) const {
3676   const Value *Src = nullptr;
3677   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3678     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3679       return true;
3680     Src = mN->getMemOperand()->getValue();
3681   }
3682   if (!Src)
3683     return false;
3684   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3685     return (PT->getAddressSpace() == spN);
3686   return false;
3687 }
3688 
3689 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690 /// inline asm expressions.
3691 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3692     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3693   SDValue Op0, Op1;
3694   switch (ConstraintID) {
3695   default:
3696     return true;
3697   case InlineAsm::Constraint_m: // memory
3698     if (SelectDirectAddr(Op, Op0)) {
3699       OutOps.push_back(Op0);
3700       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3701       return false;
3702     }
3703     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3704       OutOps.push_back(Op0);
3705       OutOps.push_back(Op1);
3706       return false;
3707     }
3708     break;
3709   }
3710   return true;
3711 }
3712 
3713 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3714 /// conversion from \p SrcTy to \p DestTy.
3715 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3716                                              bool IsSigned) {
3717   switch (SrcTy.SimpleTy) {
3718   default:
3719     llvm_unreachable("Unhandled source type");
3720   case MVT::i8:
3721     switch (DestTy.SimpleTy) {
3722     default:
3723       llvm_unreachable("Unhandled dest type");
3724     case MVT::i16:
3725       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3726     case MVT::i32:
3727       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3728     case MVT::i64:
3729       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3730     }
3731   case MVT::i16:
3732     switch (DestTy.SimpleTy) {
3733     default:
3734       llvm_unreachable("Unhandled dest type");
3735     case MVT::i8:
3736       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3737     case MVT::i32:
3738       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3739     case MVT::i64:
3740       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3741     }
3742   case MVT::i32:
3743     switch (DestTy.SimpleTy) {
3744     default:
3745       llvm_unreachable("Unhandled dest type");
3746     case MVT::i8:
3747       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3748     case MVT::i16:
3749       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3750     case MVT::i64:
3751       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3752     }
3753   case MVT::i64:
3754     switch (DestTy.SimpleTy) {
3755     default:
3756       llvm_unreachable("Unhandled dest type");
3757     case MVT::i8:
3758       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3759     case MVT::i16:
3760       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3761     case MVT::i32:
3762       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3763     }
3764   }
3765 }
3766