1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "nvptx-isel"
30 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
31 
32 /// createNVPTXISelDag - This pass converts a legalized DAG into a
33 /// NVPTX-specific DAG, ready for instruction scheduling.
34 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
35                                        llvm::CodeGenOpt::Level OptLevel) {
36   return new NVPTXDAGToDAGISel(TM, OptLevel);
37 }
38 
// Storage for the pass ID; NVPTXDAGToDAGISel's constructor hands it to the
// SelectionDAGISel base-class constructor.
char NVPTXDAGToDAGISel::ID = 0;

// Registers the pass under the name DEBUG_TYPE ("nvptx-isel") with PASS_NAME
// as its description.
// NOTE(review): the two trailing `false` flags are presumably the usual
// INITIALIZE_PASS cfg-only / is-analysis arguments -- confirm.
INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
42 
43 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
44                                      CodeGenOpt::Level OptLevel)
45     : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
46   doMulWide = (OptLevel > 0);
47 }
48 
49 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
50   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
51   return SelectionDAGISel::runOnMachineFunction(MF);
52 }
53 
54 int NVPTXDAGToDAGISel::getDivF32Level() const {
55   return Subtarget->getTargetLowering()->getDivF32Level();
56 }
57 
58 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
59   return Subtarget->getTargetLowering()->usePrecSqrtF32();
60 }
61 
62 bool NVPTXDAGToDAGISel::useF32FTZ() const {
63   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
64 }
65 
66 bool NVPTXDAGToDAGISel::allowFMA() const {
67   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68   return TL->allowFMA(*MF, OptLevel);
69 }
70 
71 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
72   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
73   return TL->allowUnsafeFPMath(*MF);
74 }
75 
76 bool NVPTXDAGToDAGISel::useShortPointers() const {
77   return TM.useShortPointers();
78 }
79 
80 /// Select - Select instructions not customized! Used for
81 /// expanded, promoted and normal instructions.
82 void NVPTXDAGToDAGISel::Select(SDNode *N) {
83 
84   if (N->isMachineOpcode()) {
85     N->setNodeId(-1);
86     return; // Already selected.
87   }
88 
89   switch (N->getOpcode()) {
90   case ISD::LOAD:
91   case ISD::ATOMIC_LOAD:
92     if (tryLoad(N))
93       return;
94     break;
95   case ISD::STORE:
96   case ISD::ATOMIC_STORE:
97     if (tryStore(N))
98       return;
99     break;
100   case ISD::EXTRACT_VECTOR_ELT:
101     if (tryEXTRACT_VECTOR_ELEMENT(N))
102       return;
103     break;
104   case NVPTXISD::SETP_F16X2:
105     SelectSETP_F16X2(N);
106     return;
107 
108   case NVPTXISD::LoadV2:
109   case NVPTXISD::LoadV4:
110     if (tryLoadVector(N))
111       return;
112     break;
113   case NVPTXISD::LDGV2:
114   case NVPTXISD::LDGV4:
115   case NVPTXISD::LDUV2:
116   case NVPTXISD::LDUV4:
117     if (tryLDGLDU(N))
118       return;
119     break;
120   case NVPTXISD::StoreV2:
121   case NVPTXISD::StoreV4:
122     if (tryStoreVector(N))
123       return;
124     break;
125   case NVPTXISD::LoadParam:
126   case NVPTXISD::LoadParamV2:
127   case NVPTXISD::LoadParamV4:
128     if (tryLoadParam(N))
129       return;
130     break;
131   case NVPTXISD::StoreRetval:
132   case NVPTXISD::StoreRetvalV2:
133   case NVPTXISD::StoreRetvalV4:
134     if (tryStoreRetval(N))
135       return;
136     break;
137   case NVPTXISD::StoreParam:
138   case NVPTXISD::StoreParamV2:
139   case NVPTXISD::StoreParamV4:
140   case NVPTXISD::StoreParamS32:
141   case NVPTXISD::StoreParamU32:
142     if (tryStoreParam(N))
143       return;
144     break;
145   case ISD::INTRINSIC_WO_CHAIN:
146     if (tryIntrinsicNoChain(N))
147       return;
148     break;
149   case ISD::INTRINSIC_W_CHAIN:
150     if (tryIntrinsicChain(N))
151       return;
152     break;
153   case NVPTXISD::Tex1DFloatS32:
154   case NVPTXISD::Tex1DFloatFloat:
155   case NVPTXISD::Tex1DFloatFloatLevel:
156   case NVPTXISD::Tex1DFloatFloatGrad:
157   case NVPTXISD::Tex1DS32S32:
158   case NVPTXISD::Tex1DS32Float:
159   case NVPTXISD::Tex1DS32FloatLevel:
160   case NVPTXISD::Tex1DS32FloatGrad:
161   case NVPTXISD::Tex1DU32S32:
162   case NVPTXISD::Tex1DU32Float:
163   case NVPTXISD::Tex1DU32FloatLevel:
164   case NVPTXISD::Tex1DU32FloatGrad:
165   case NVPTXISD::Tex1DArrayFloatS32:
166   case NVPTXISD::Tex1DArrayFloatFloat:
167   case NVPTXISD::Tex1DArrayFloatFloatLevel:
168   case NVPTXISD::Tex1DArrayFloatFloatGrad:
169   case NVPTXISD::Tex1DArrayS32S32:
170   case NVPTXISD::Tex1DArrayS32Float:
171   case NVPTXISD::Tex1DArrayS32FloatLevel:
172   case NVPTXISD::Tex1DArrayS32FloatGrad:
173   case NVPTXISD::Tex1DArrayU32S32:
174   case NVPTXISD::Tex1DArrayU32Float:
175   case NVPTXISD::Tex1DArrayU32FloatLevel:
176   case NVPTXISD::Tex1DArrayU32FloatGrad:
177   case NVPTXISD::Tex2DFloatS32:
178   case NVPTXISD::Tex2DFloatFloat:
179   case NVPTXISD::Tex2DFloatFloatLevel:
180   case NVPTXISD::Tex2DFloatFloatGrad:
181   case NVPTXISD::Tex2DS32S32:
182   case NVPTXISD::Tex2DS32Float:
183   case NVPTXISD::Tex2DS32FloatLevel:
184   case NVPTXISD::Tex2DS32FloatGrad:
185   case NVPTXISD::Tex2DU32S32:
186   case NVPTXISD::Tex2DU32Float:
187   case NVPTXISD::Tex2DU32FloatLevel:
188   case NVPTXISD::Tex2DU32FloatGrad:
189   case NVPTXISD::Tex2DArrayFloatS32:
190   case NVPTXISD::Tex2DArrayFloatFloat:
191   case NVPTXISD::Tex2DArrayFloatFloatLevel:
192   case NVPTXISD::Tex2DArrayFloatFloatGrad:
193   case NVPTXISD::Tex2DArrayS32S32:
194   case NVPTXISD::Tex2DArrayS32Float:
195   case NVPTXISD::Tex2DArrayS32FloatLevel:
196   case NVPTXISD::Tex2DArrayS32FloatGrad:
197   case NVPTXISD::Tex2DArrayU32S32:
198   case NVPTXISD::Tex2DArrayU32Float:
199   case NVPTXISD::Tex2DArrayU32FloatLevel:
200   case NVPTXISD::Tex2DArrayU32FloatGrad:
201   case NVPTXISD::Tex3DFloatS32:
202   case NVPTXISD::Tex3DFloatFloat:
203   case NVPTXISD::Tex3DFloatFloatLevel:
204   case NVPTXISD::Tex3DFloatFloatGrad:
205   case NVPTXISD::Tex3DS32S32:
206   case NVPTXISD::Tex3DS32Float:
207   case NVPTXISD::Tex3DS32FloatLevel:
208   case NVPTXISD::Tex3DS32FloatGrad:
209   case NVPTXISD::Tex3DU32S32:
210   case NVPTXISD::Tex3DU32Float:
211   case NVPTXISD::Tex3DU32FloatLevel:
212   case NVPTXISD::Tex3DU32FloatGrad:
213   case NVPTXISD::TexCubeFloatFloat:
214   case NVPTXISD::TexCubeFloatFloatLevel:
215   case NVPTXISD::TexCubeS32Float:
216   case NVPTXISD::TexCubeS32FloatLevel:
217   case NVPTXISD::TexCubeU32Float:
218   case NVPTXISD::TexCubeU32FloatLevel:
219   case NVPTXISD::TexCubeArrayFloatFloat:
220   case NVPTXISD::TexCubeArrayFloatFloatLevel:
221   case NVPTXISD::TexCubeArrayS32Float:
222   case NVPTXISD::TexCubeArrayS32FloatLevel:
223   case NVPTXISD::TexCubeArrayU32Float:
224   case NVPTXISD::TexCubeArrayU32FloatLevel:
225   case NVPTXISD::Tld4R2DFloatFloat:
226   case NVPTXISD::Tld4G2DFloatFloat:
227   case NVPTXISD::Tld4B2DFloatFloat:
228   case NVPTXISD::Tld4A2DFloatFloat:
229   case NVPTXISD::Tld4R2DS64Float:
230   case NVPTXISD::Tld4G2DS64Float:
231   case NVPTXISD::Tld4B2DS64Float:
232   case NVPTXISD::Tld4A2DS64Float:
233   case NVPTXISD::Tld4R2DU64Float:
234   case NVPTXISD::Tld4G2DU64Float:
235   case NVPTXISD::Tld4B2DU64Float:
236   case NVPTXISD::Tld4A2DU64Float:
237   case NVPTXISD::TexUnified1DFloatS32:
238   case NVPTXISD::TexUnified1DFloatFloat:
239   case NVPTXISD::TexUnified1DFloatFloatLevel:
240   case NVPTXISD::TexUnified1DFloatFloatGrad:
241   case NVPTXISD::TexUnified1DS32S32:
242   case NVPTXISD::TexUnified1DS32Float:
243   case NVPTXISD::TexUnified1DS32FloatLevel:
244   case NVPTXISD::TexUnified1DS32FloatGrad:
245   case NVPTXISD::TexUnified1DU32S32:
246   case NVPTXISD::TexUnified1DU32Float:
247   case NVPTXISD::TexUnified1DU32FloatLevel:
248   case NVPTXISD::TexUnified1DU32FloatGrad:
249   case NVPTXISD::TexUnified1DArrayFloatS32:
250   case NVPTXISD::TexUnified1DArrayFloatFloat:
251   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
252   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
253   case NVPTXISD::TexUnified1DArrayS32S32:
254   case NVPTXISD::TexUnified1DArrayS32Float:
255   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
256   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
257   case NVPTXISD::TexUnified1DArrayU32S32:
258   case NVPTXISD::TexUnified1DArrayU32Float:
259   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
260   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
261   case NVPTXISD::TexUnified2DFloatS32:
262   case NVPTXISD::TexUnified2DFloatFloat:
263   case NVPTXISD::TexUnified2DFloatFloatLevel:
264   case NVPTXISD::TexUnified2DFloatFloatGrad:
265   case NVPTXISD::TexUnified2DS32S32:
266   case NVPTXISD::TexUnified2DS32Float:
267   case NVPTXISD::TexUnified2DS32FloatLevel:
268   case NVPTXISD::TexUnified2DS32FloatGrad:
269   case NVPTXISD::TexUnified2DU32S32:
270   case NVPTXISD::TexUnified2DU32Float:
271   case NVPTXISD::TexUnified2DU32FloatLevel:
272   case NVPTXISD::TexUnified2DU32FloatGrad:
273   case NVPTXISD::TexUnified2DArrayFloatS32:
274   case NVPTXISD::TexUnified2DArrayFloatFloat:
275   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
276   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
277   case NVPTXISD::TexUnified2DArrayS32S32:
278   case NVPTXISD::TexUnified2DArrayS32Float:
279   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
280   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
281   case NVPTXISD::TexUnified2DArrayU32S32:
282   case NVPTXISD::TexUnified2DArrayU32Float:
283   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
284   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
285   case NVPTXISD::TexUnified3DFloatS32:
286   case NVPTXISD::TexUnified3DFloatFloat:
287   case NVPTXISD::TexUnified3DFloatFloatLevel:
288   case NVPTXISD::TexUnified3DFloatFloatGrad:
289   case NVPTXISD::TexUnified3DS32S32:
290   case NVPTXISD::TexUnified3DS32Float:
291   case NVPTXISD::TexUnified3DS32FloatLevel:
292   case NVPTXISD::TexUnified3DS32FloatGrad:
293   case NVPTXISD::TexUnified3DU32S32:
294   case NVPTXISD::TexUnified3DU32Float:
295   case NVPTXISD::TexUnified3DU32FloatLevel:
296   case NVPTXISD::TexUnified3DU32FloatGrad:
297   case NVPTXISD::TexUnifiedCubeFloatFloat:
298   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
299   case NVPTXISD::TexUnifiedCubeS32Float:
300   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
301   case NVPTXISD::TexUnifiedCubeU32Float:
302   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
303   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
304   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
305   case NVPTXISD::TexUnifiedCubeArrayS32Float:
306   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
307   case NVPTXISD::TexUnifiedCubeArrayU32Float:
308   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
309   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
310   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
311   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
312   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
313   case NVPTXISD::Tld4UnifiedR2DS64Float:
314   case NVPTXISD::Tld4UnifiedG2DS64Float:
315   case NVPTXISD::Tld4UnifiedB2DS64Float:
316   case NVPTXISD::Tld4UnifiedA2DS64Float:
317   case NVPTXISD::Tld4UnifiedR2DU64Float:
318   case NVPTXISD::Tld4UnifiedG2DU64Float:
319   case NVPTXISD::Tld4UnifiedB2DU64Float:
320   case NVPTXISD::Tld4UnifiedA2DU64Float:
321     if (tryTextureIntrinsic(N))
322       return;
323     break;
324   case NVPTXISD::Suld1DI8Clamp:
325   case NVPTXISD::Suld1DI16Clamp:
326   case NVPTXISD::Suld1DI32Clamp:
327   case NVPTXISD::Suld1DI64Clamp:
328   case NVPTXISD::Suld1DV2I8Clamp:
329   case NVPTXISD::Suld1DV2I16Clamp:
330   case NVPTXISD::Suld1DV2I32Clamp:
331   case NVPTXISD::Suld1DV2I64Clamp:
332   case NVPTXISD::Suld1DV4I8Clamp:
333   case NVPTXISD::Suld1DV4I16Clamp:
334   case NVPTXISD::Suld1DV4I32Clamp:
335   case NVPTXISD::Suld1DArrayI8Clamp:
336   case NVPTXISD::Suld1DArrayI16Clamp:
337   case NVPTXISD::Suld1DArrayI32Clamp:
338   case NVPTXISD::Suld1DArrayI64Clamp:
339   case NVPTXISD::Suld1DArrayV2I8Clamp:
340   case NVPTXISD::Suld1DArrayV2I16Clamp:
341   case NVPTXISD::Suld1DArrayV2I32Clamp:
342   case NVPTXISD::Suld1DArrayV2I64Clamp:
343   case NVPTXISD::Suld1DArrayV4I8Clamp:
344   case NVPTXISD::Suld1DArrayV4I16Clamp:
345   case NVPTXISD::Suld1DArrayV4I32Clamp:
346   case NVPTXISD::Suld2DI8Clamp:
347   case NVPTXISD::Suld2DI16Clamp:
348   case NVPTXISD::Suld2DI32Clamp:
349   case NVPTXISD::Suld2DI64Clamp:
350   case NVPTXISD::Suld2DV2I8Clamp:
351   case NVPTXISD::Suld2DV2I16Clamp:
352   case NVPTXISD::Suld2DV2I32Clamp:
353   case NVPTXISD::Suld2DV2I64Clamp:
354   case NVPTXISD::Suld2DV4I8Clamp:
355   case NVPTXISD::Suld2DV4I16Clamp:
356   case NVPTXISD::Suld2DV4I32Clamp:
357   case NVPTXISD::Suld2DArrayI8Clamp:
358   case NVPTXISD::Suld2DArrayI16Clamp:
359   case NVPTXISD::Suld2DArrayI32Clamp:
360   case NVPTXISD::Suld2DArrayI64Clamp:
361   case NVPTXISD::Suld2DArrayV2I8Clamp:
362   case NVPTXISD::Suld2DArrayV2I16Clamp:
363   case NVPTXISD::Suld2DArrayV2I32Clamp:
364   case NVPTXISD::Suld2DArrayV2I64Clamp:
365   case NVPTXISD::Suld2DArrayV4I8Clamp:
366   case NVPTXISD::Suld2DArrayV4I16Clamp:
367   case NVPTXISD::Suld2DArrayV4I32Clamp:
368   case NVPTXISD::Suld3DI8Clamp:
369   case NVPTXISD::Suld3DI16Clamp:
370   case NVPTXISD::Suld3DI32Clamp:
371   case NVPTXISD::Suld3DI64Clamp:
372   case NVPTXISD::Suld3DV2I8Clamp:
373   case NVPTXISD::Suld3DV2I16Clamp:
374   case NVPTXISD::Suld3DV2I32Clamp:
375   case NVPTXISD::Suld3DV2I64Clamp:
376   case NVPTXISD::Suld3DV4I8Clamp:
377   case NVPTXISD::Suld3DV4I16Clamp:
378   case NVPTXISD::Suld3DV4I32Clamp:
379   case NVPTXISD::Suld1DI8Trap:
380   case NVPTXISD::Suld1DI16Trap:
381   case NVPTXISD::Suld1DI32Trap:
382   case NVPTXISD::Suld1DI64Trap:
383   case NVPTXISD::Suld1DV2I8Trap:
384   case NVPTXISD::Suld1DV2I16Trap:
385   case NVPTXISD::Suld1DV2I32Trap:
386   case NVPTXISD::Suld1DV2I64Trap:
387   case NVPTXISD::Suld1DV4I8Trap:
388   case NVPTXISD::Suld1DV4I16Trap:
389   case NVPTXISD::Suld1DV4I32Trap:
390   case NVPTXISD::Suld1DArrayI8Trap:
391   case NVPTXISD::Suld1DArrayI16Trap:
392   case NVPTXISD::Suld1DArrayI32Trap:
393   case NVPTXISD::Suld1DArrayI64Trap:
394   case NVPTXISD::Suld1DArrayV2I8Trap:
395   case NVPTXISD::Suld1DArrayV2I16Trap:
396   case NVPTXISD::Suld1DArrayV2I32Trap:
397   case NVPTXISD::Suld1DArrayV2I64Trap:
398   case NVPTXISD::Suld1DArrayV4I8Trap:
399   case NVPTXISD::Suld1DArrayV4I16Trap:
400   case NVPTXISD::Suld1DArrayV4I32Trap:
401   case NVPTXISD::Suld2DI8Trap:
402   case NVPTXISD::Suld2DI16Trap:
403   case NVPTXISD::Suld2DI32Trap:
404   case NVPTXISD::Suld2DI64Trap:
405   case NVPTXISD::Suld2DV2I8Trap:
406   case NVPTXISD::Suld2DV2I16Trap:
407   case NVPTXISD::Suld2DV2I32Trap:
408   case NVPTXISD::Suld2DV2I64Trap:
409   case NVPTXISD::Suld2DV4I8Trap:
410   case NVPTXISD::Suld2DV4I16Trap:
411   case NVPTXISD::Suld2DV4I32Trap:
412   case NVPTXISD::Suld2DArrayI8Trap:
413   case NVPTXISD::Suld2DArrayI16Trap:
414   case NVPTXISD::Suld2DArrayI32Trap:
415   case NVPTXISD::Suld2DArrayI64Trap:
416   case NVPTXISD::Suld2DArrayV2I8Trap:
417   case NVPTXISD::Suld2DArrayV2I16Trap:
418   case NVPTXISD::Suld2DArrayV2I32Trap:
419   case NVPTXISD::Suld2DArrayV2I64Trap:
420   case NVPTXISD::Suld2DArrayV4I8Trap:
421   case NVPTXISD::Suld2DArrayV4I16Trap:
422   case NVPTXISD::Suld2DArrayV4I32Trap:
423   case NVPTXISD::Suld3DI8Trap:
424   case NVPTXISD::Suld3DI16Trap:
425   case NVPTXISD::Suld3DI32Trap:
426   case NVPTXISD::Suld3DI64Trap:
427   case NVPTXISD::Suld3DV2I8Trap:
428   case NVPTXISD::Suld3DV2I16Trap:
429   case NVPTXISD::Suld3DV2I32Trap:
430   case NVPTXISD::Suld3DV2I64Trap:
431   case NVPTXISD::Suld3DV4I8Trap:
432   case NVPTXISD::Suld3DV4I16Trap:
433   case NVPTXISD::Suld3DV4I32Trap:
434   case NVPTXISD::Suld1DI8Zero:
435   case NVPTXISD::Suld1DI16Zero:
436   case NVPTXISD::Suld1DI32Zero:
437   case NVPTXISD::Suld1DI64Zero:
438   case NVPTXISD::Suld1DV2I8Zero:
439   case NVPTXISD::Suld1DV2I16Zero:
440   case NVPTXISD::Suld1DV2I32Zero:
441   case NVPTXISD::Suld1DV2I64Zero:
442   case NVPTXISD::Suld1DV4I8Zero:
443   case NVPTXISD::Suld1DV4I16Zero:
444   case NVPTXISD::Suld1DV4I32Zero:
445   case NVPTXISD::Suld1DArrayI8Zero:
446   case NVPTXISD::Suld1DArrayI16Zero:
447   case NVPTXISD::Suld1DArrayI32Zero:
448   case NVPTXISD::Suld1DArrayI64Zero:
449   case NVPTXISD::Suld1DArrayV2I8Zero:
450   case NVPTXISD::Suld1DArrayV2I16Zero:
451   case NVPTXISD::Suld1DArrayV2I32Zero:
452   case NVPTXISD::Suld1DArrayV2I64Zero:
453   case NVPTXISD::Suld1DArrayV4I8Zero:
454   case NVPTXISD::Suld1DArrayV4I16Zero:
455   case NVPTXISD::Suld1DArrayV4I32Zero:
456   case NVPTXISD::Suld2DI8Zero:
457   case NVPTXISD::Suld2DI16Zero:
458   case NVPTXISD::Suld2DI32Zero:
459   case NVPTXISD::Suld2DI64Zero:
460   case NVPTXISD::Suld2DV2I8Zero:
461   case NVPTXISD::Suld2DV2I16Zero:
462   case NVPTXISD::Suld2DV2I32Zero:
463   case NVPTXISD::Suld2DV2I64Zero:
464   case NVPTXISD::Suld2DV4I8Zero:
465   case NVPTXISD::Suld2DV4I16Zero:
466   case NVPTXISD::Suld2DV4I32Zero:
467   case NVPTXISD::Suld2DArrayI8Zero:
468   case NVPTXISD::Suld2DArrayI16Zero:
469   case NVPTXISD::Suld2DArrayI32Zero:
470   case NVPTXISD::Suld2DArrayI64Zero:
471   case NVPTXISD::Suld2DArrayV2I8Zero:
472   case NVPTXISD::Suld2DArrayV2I16Zero:
473   case NVPTXISD::Suld2DArrayV2I32Zero:
474   case NVPTXISD::Suld2DArrayV2I64Zero:
475   case NVPTXISD::Suld2DArrayV4I8Zero:
476   case NVPTXISD::Suld2DArrayV4I16Zero:
477   case NVPTXISD::Suld2DArrayV4I32Zero:
478   case NVPTXISD::Suld3DI8Zero:
479   case NVPTXISD::Suld3DI16Zero:
480   case NVPTXISD::Suld3DI32Zero:
481   case NVPTXISD::Suld3DI64Zero:
482   case NVPTXISD::Suld3DV2I8Zero:
483   case NVPTXISD::Suld3DV2I16Zero:
484   case NVPTXISD::Suld3DV2I32Zero:
485   case NVPTXISD::Suld3DV2I64Zero:
486   case NVPTXISD::Suld3DV4I8Zero:
487   case NVPTXISD::Suld3DV4I16Zero:
488   case NVPTXISD::Suld3DV4I32Zero:
489     if (trySurfaceIntrinsic(N))
490       return;
491     break;
492   case ISD::AND:
493   case ISD::SRA:
494   case ISD::SRL:
495     // Try to select BFE
496     if (tryBFE(N))
497       return;
498     break;
499   case ISD::ADDRSPACECAST:
500     SelectAddrSpaceCast(N);
501     return;
502   case ISD::ConstantFP:
503     if (tryConstantFP(N))
504       return;
505     break;
506   default:
507     break;
508   }
509   SelectCode(N);
510 }
511 
512 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
513   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
514   switch (IID) {
515   default:
516     return false;
517   case Intrinsic::nvvm_ldg_global_f:
518   case Intrinsic::nvvm_ldg_global_i:
519   case Intrinsic::nvvm_ldg_global_p:
520   case Intrinsic::nvvm_ldu_global_f:
521   case Intrinsic::nvvm_ldu_global_i:
522   case Intrinsic::nvvm_ldu_global_p:
523     return tryLDGLDU(N);
524   }
525 }
526 
527 // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
528 // have to load them into an .(b)f16 register first.
529 bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
530   if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
531     return false;
532   SDValue Val = CurDAG->getTargetConstantFP(
533       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
534   SDNode *LoadConstF16 = CurDAG->getMachineNode(
535       (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
536                                       : NVPTX::LOAD_CONST_BF16),
537       SDLoc(N), N->getValueType(0), Val);
538   ReplaceNode(N, LoadConstF16);
539   return true;
540 }
541 
542 // Map ISD:CONDCODE value to appropriate CmpMode expected by
543 // NVPTXInstPrinter::printCmpMode()
544 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
545   using NVPTX::PTXCmpMode::CmpMode;
546   unsigned PTXCmpMode = [](ISD::CondCode CC) {
547     switch (CC) {
548     default:
549       llvm_unreachable("Unexpected condition code.");
550     case ISD::SETOEQ:
551       return CmpMode::EQ;
552     case ISD::SETOGT:
553       return CmpMode::GT;
554     case ISD::SETOGE:
555       return CmpMode::GE;
556     case ISD::SETOLT:
557       return CmpMode::LT;
558     case ISD::SETOLE:
559       return CmpMode::LE;
560     case ISD::SETONE:
561       return CmpMode::NE;
562     case ISD::SETO:
563       return CmpMode::NUM;
564     case ISD::SETUO:
565       return CmpMode::NotANumber;
566     case ISD::SETUEQ:
567       return CmpMode::EQU;
568     case ISD::SETUGT:
569       return CmpMode::GTU;
570     case ISD::SETUGE:
571       return CmpMode::GEU;
572     case ISD::SETULT:
573       return CmpMode::LTU;
574     case ISD::SETULE:
575       return CmpMode::LEU;
576     case ISD::SETUNE:
577       return CmpMode::NEU;
578     case ISD::SETEQ:
579       return CmpMode::EQ;
580     case ISD::SETGT:
581       return CmpMode::GT;
582     case ISD::SETGE:
583       return CmpMode::GE;
584     case ISD::SETLT:
585       return CmpMode::LT;
586     case ISD::SETLE:
587       return CmpMode::LE;
588     case ISD::SETNE:
589       return CmpMode::NE;
590     }
591   }(CondCode.get());
592 
593   if (FTZ)
594     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
595 
596   return PTXCmpMode;
597 }
598 
599 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
600   unsigned PTXCmpMode =
601       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
602   SDLoc DL(N);
603   SDNode *SetP = CurDAG->getMachineNode(
604       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
605       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
606   ReplaceNode(N, SetP);
607   return true;
608 }
609 
610 // Find all instances of extract_vector_elt that use this v2f16 vector
611 // and coalesce them into a scattering move instruction.
612 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
613   SDValue Vector = N->getOperand(0);
614 
615   // We only care about f16x2 as it's the only real vector type we
616   // need to deal with.
617   MVT VT = Vector.getSimpleValueType();
618   if (!(VT == MVT::v2f16 || VT == MVT::v2bf16))
619     return false;
620   // Find and record all uses of this vector that extract element 0 or 1.
621   SmallVector<SDNode *, 4> E0, E1;
622   for (auto *U : Vector.getNode()->uses()) {
623     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
624       continue;
625     if (U->getOperand(0) != Vector)
626       continue;
627     if (const ConstantSDNode *IdxConst =
628             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
629       if (IdxConst->getZExtValue() == 0)
630         E0.push_back(U);
631       else if (IdxConst->getZExtValue() == 1)
632         E1.push_back(U);
633       else
634         llvm_unreachable("Invalid vector index.");
635     }
636   }
637 
638   // There's no point scattering f16x2 if we only ever access one
639   // element of it.
640   if (E0.empty() || E1.empty())
641     return false;
642 
643   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
644   // into f16,f16 SplitF16x2(V)
645   MVT EltVT = VT.getVectorElementType();
646   SDNode *ScatterOp =
647       CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
648   for (auto *Node : E0)
649     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
650   for (auto *Node : E1)
651     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
652 
653   return true;
654 }
655 
656 static unsigned int getCodeAddrSpace(MemSDNode *N) {
657   const Value *Src = N->getMemOperand()->getValue();
658 
659   if (!Src)
660     return NVPTX::PTXLdStInstCode::GENERIC;
661 
662   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
663     switch (PT->getAddressSpace()) {
664     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
665     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
666     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
667     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
668     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
669     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
670     default: break;
671     }
672   }
673   return NVPTX::PTXLdStInstCode::GENERIC;
674 }
675 
676 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
677                           unsigned CodeAddrSpace, MachineFunction *F) {
678   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
679   // space.
680   //
681   // We have two ways of identifying invariant loads: Loads may be explicitly
682   // marked as invariant, or we may infer them to be invariant.
683   //
684   // We currently infer invariance for loads from
685   //  - constant global variables, and
686   //  - kernel function pointer params that are noalias (i.e. __restrict) and
687   //    never written to.
688   //
689   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
690   // not during the SelectionDAG phase).
691   //
692   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
693   // explicitly invariant loads because these are how clang tells us to use ldg
694   // when the user uses a builtin.
695   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
696     return false;
697 
698   if (N->isInvariant())
699     return true;
700 
701   bool IsKernelFn = isKernelFunction(F->getFunction());
702 
703   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
704   // because the former looks through phi nodes while the latter does not. We
705   // need to look through phi nodes to handle pointer induction variables.
706   SmallVector<const Value *, 8> Objs;
707   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
708 
709   return all_of(Objs, [&](const Value *V) {
710     if (auto *A = dyn_cast<const Argument>(V))
711       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
712     if (auto *GV = dyn_cast<const GlobalVariable>(V))
713       return GV->isConstant();
714     return false;
715   });
716 }
717 
718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
719   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
720   switch (IID) {
721   default:
722     return false;
723   case Intrinsic::nvvm_texsurf_handle_internal:
724     SelectTexSurfHandle(N);
725     return true;
726   }
727 }
728 
729 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
730   // Op 0 is the intrinsic ID
731   SDValue Wrapper = N->getOperand(1);
732   SDValue GlobalVal = Wrapper.getOperand(0);
733   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
734                                         MVT::i64, GlobalVal));
735 }
736 
737 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
738   SDValue Src = N->getOperand(0);
739   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
740   unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
741   unsigned DstAddrSpace = CastN->getDestAddressSpace();
742   assert(SrcAddrSpace != DstAddrSpace &&
743          "addrspacecast must be between different address spaces");
744 
745   if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
746     // Specific to generic
747     unsigned Opc;
748     switch (SrcAddrSpace) {
749     default: report_fatal_error("Bad address space in addrspacecast");
750     case ADDRESS_SPACE_GLOBAL:
751       Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
752       break;
753     case ADDRESS_SPACE_SHARED:
754       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
755                                                : NVPTX::cvta_shared_yes_64)
756                          : NVPTX::cvta_shared_yes;
757       break;
758     case ADDRESS_SPACE_CONST:
759       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
760                                                : NVPTX::cvta_const_yes_64)
761                          : NVPTX::cvta_const_yes;
762       break;
763     case ADDRESS_SPACE_LOCAL:
764       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
765                                                : NVPTX::cvta_local_yes_64)
766                          : NVPTX::cvta_local_yes;
767       break;
768     }
769     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
770                                           Src));
771     return;
772   } else {
773     // Generic to specific
774     if (SrcAddrSpace != 0)
775       report_fatal_error("Cannot cast between two non-generic address spaces");
776     unsigned Opc;
777     switch (DstAddrSpace) {
778     default: report_fatal_error("Bad address space in addrspacecast");
779     case ADDRESS_SPACE_GLOBAL:
780       Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
781                          : NVPTX::cvta_to_global_yes;
782       break;
783     case ADDRESS_SPACE_SHARED:
784       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
785                                                 : NVPTX::cvta_to_shared_yes_64)
786                          : NVPTX::cvta_to_shared_yes;
787       break;
788     case ADDRESS_SPACE_CONST:
789       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
790                                              : NVPTX::cvta_to_const_yes_64)
791                          : NVPTX::cvta_to_const_yes;
792       break;
793     case ADDRESS_SPACE_LOCAL:
794       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
795                                                : NVPTX::cvta_to_local_yes_64)
796                          : NVPTX::cvta_to_local_yes;
797       break;
798     case ADDRESS_SPACE_PARAM:
799       Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
800                          : NVPTX::nvvm_ptr_gen_to_param;
801       break;
802     }
803     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
804                                           Src));
805     return;
806   }
807 }
808 
809 // Helper function template to reduce amount of boilerplate code for
810 // opcode selection.
811 static std::optional<unsigned>
812 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
813                 unsigned Opcode_i16, unsigned Opcode_i32,
814                 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
815                 std::optional<unsigned> Opcode_f64) {
816   switch (VT) {
817   case MVT::i1:
818   case MVT::i8:
819     return Opcode_i8;
820   case MVT::i16:
821     return Opcode_i16;
822   case MVT::i32:
823     return Opcode_i32;
824   case MVT::i64:
825     return Opcode_i64;
826   case MVT::f16:
827   case MVT::bf16:
828     return Opcode_i16;
829   case MVT::v2f16:
830   case MVT::v2bf16:
831     return Opcode_i32;
832   case MVT::f32:
833     return Opcode_f32;
834   case MVT::f64:
835     return Opcode_f64;
836   default:
837     return std::nullopt;
838   }
839 }
840 
841 static int getLdStRegType(EVT VT) {
842   if (VT.isFloatingPoint())
843     switch (VT.getSimpleVT().SimpleTy) {
844     case MVT::f16:
845     case MVT::bf16:
846     case MVT::v2f16:
847     case MVT::v2bf16:
848       return NVPTX::PTXLdStInstCode::Untyped;
849     default:
850       return NVPTX::PTXLdStInstCode::Float;
851     }
852   else
853     return NVPTX::PTXLdStInstCode::Unsigned;
854 }
855 
/// Try to select the load node \p N into an NVPTX LD_* machine instruction.
///
/// Returns false without replacing \p N when the load is indexed, its memory
/// type is not simple, its atomic ordering is stronger than monotonic, or
/// pickOpcodeForVT() has no opcode for the result type. Loads accepted by
/// canLowerToLDG() are handed to tryLDGLDU() and its result is returned.
/// Otherwise \p N is replaced by the new machine node and true is returned.
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null when N is a memory node that is not a LoadSDNode.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width of the accessed address space; selects between the 32-bit
  // and 64-bit address matchers and opcodes below.
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  // Monotonic atomic loads are therefore emitted as volatile. Note that the
  // check below also keeps the flag for the GENERIC address space.
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  // vecType is never changed from Scalar in this function: the only vector
  // types accepted here are read as one 32-bit element.
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16/v2bf16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  // Only a sign-extending plain load is marked Signed; everything else takes
  // its type code from the scalar type.
  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // The address matchers are tried in a fixed priority order: SelectDirectAddr
  // (avar opcodes), SelectADDRsi (asi), SelectADDRri (ari), and finally the
  // address operand is used as-is (areg). Every variant takes the same five
  // leading immediates: volatile flag, address space, vector kind, source type
  // and source width.
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    // The asi opcodes are the same for both pointer sizes; only the matcher
    // differs.
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    // Fallback: no matcher succeeded, so the address operand N1 is passed
    // through unchanged.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  // Defensive check: every branch above either returned or assigned NVPTXLD.
  if (!NVPTXLD)
    return false;

  // Carry the original node's memory operand over to the machine node.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
996 
/// Try to select an NVPTXISD::LoadV2 / NVPTXISD::LoadV4 node \p N into an
/// NVPTX LDV_* machine instruction.
///
/// Returns false without replacing \p N when the memory type is not simple,
/// the opcode is neither LoadV2 nor LoadV4, or pickOpcodeForVT() has no opcode
/// for the element type. Loads accepted by canLowerToLDG() are handed to
/// tryLDGLDU() and its result is returned. Otherwise \p N is replaced by the
/// new machine node and true is returned.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  // Assigned in every branch of the address-mode chain below that does not
  // return early.
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width of the accessed address space; selects between the 32-bit
  // and 64-bit address matchers and opcodes below.
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // Note that the check below also keeps the flag for the GENERIC address
  // space.
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  // Only LoadV2 and LoadV4 are handled; any other opcode is rejected here, so
  // the `default` cases of the opcode switches further down are not expected
  // to be reached.
  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  // The first result type of N is used as the per-element type for opcode
  // selection.
  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // The address matchers are tried in a fixed priority order: SelectDirectAddr
  // (avar opcodes), SelectADDRsi (asi), SelectADDRri (ari), and finally the
  // address operand is used as-is (areg). The LoadV4 opcode tables pass
  // std::nullopt for the i64 and f64 slots, so selection fails for V4 loads of
  // 64-bit elements.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    // The asi opcodes are the same for both pointer sizes; only the matcher
    // differs.
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    // Fallback: no matcher succeeded, so the address operand Op1 is passed
    // through unchanged.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Carry the original node's memory operand over to the machine node.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1220 
1221 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1222 
1223   SDValue Chain = N->getOperand(0);
1224   SDValue Op1;
1225   MemSDNode *Mem;
1226   bool IsLDG = true;
1227 
1228   // If this is an LDG intrinsic, the address is the third operand. If its an
1229   // LDG/LDU SD node (from custom vector handling), then its the second operand
1230   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1231     Op1 = N->getOperand(2);
1232     Mem = cast<MemIntrinsicSDNode>(N);
1233     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1234     switch (IID) {
1235     default:
1236       return false;
1237     case Intrinsic::nvvm_ldg_global_f:
1238     case Intrinsic::nvvm_ldg_global_i:
1239     case Intrinsic::nvvm_ldg_global_p:
1240       IsLDG = true;
1241       break;
1242     case Intrinsic::nvvm_ldu_global_f:
1243     case Intrinsic::nvvm_ldu_global_i:
1244     case Intrinsic::nvvm_ldu_global_p:
1245       IsLDG = false;
1246       break;
1247     }
1248   } else {
1249     Op1 = N->getOperand(1);
1250     Mem = cast<MemSDNode>(N);
1251   }
1252 
1253   std::optional<unsigned> Opcode;
1254   SDLoc DL(N);
1255   SDNode *LD;
1256   SDValue Base, Offset, Addr;
1257 
1258   EVT EltVT = Mem->getMemoryVT();
1259   unsigned NumElts = 1;
1260   if (EltVT.isVector()) {
1261     NumElts = EltVT.getVectorNumElements();
1262     EltVT = EltVT.getVectorElementType();
1263     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1264     if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) ||
1265         (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16)) {
1266           assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1267           EltVT = N->getValueType(0);
1268           NumElts /= 2;
1269     }
1270   }
1271 
1272   // Build the "promoted" result VTList for the load. If we are really loading
1273   // i8s, then the return type will be promoted to i16 since we do not expose
1274   // 8-bit registers in NVPTX.
1275   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1276   SmallVector<EVT, 5> InstVTs;
1277   for (unsigned i = 0; i != NumElts; ++i) {
1278     InstVTs.push_back(NodeVT);
1279   }
1280   InstVTs.push_back(MVT::Other);
1281   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1282 
1283   if (SelectDirectAddr(Op1, Addr)) {
1284     switch (N->getOpcode()) {
1285     default:
1286       return false;
1287     case ISD::LOAD:
1288     case ISD::INTRINSIC_W_CHAIN:
1289       if (IsLDG)
1290         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1291                                  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1292                                  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1293                                  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1294                                  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1295                                  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1296                                  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1297       else
1298         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1299                                  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1300                                  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1301                                  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1302                                  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1303                                  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1304                                  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1305       break;
1306     case NVPTXISD::LoadV2:
1307     case NVPTXISD::LDGV2:
1308       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1309                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1310                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1311                                NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1312                                NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1313                                NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1314                                NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1315       break;
1316     case NVPTXISD::LDUV2:
1317       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1318                                NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1319                                NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1320                                NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1321                                NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1322                                NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1323                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1324       break;
1325     case NVPTXISD::LoadV4:
1326     case NVPTXISD::LDGV4:
1327       Opcode = pickOpcodeForVT(
1328           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1329           NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1330           NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1331           NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1332       break;
1333     case NVPTXISD::LDUV4:
1334       Opcode = pickOpcodeForVT(
1335           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1336           NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1337           NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1338           NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1339       break;
1340     }
1341     if (!Opcode)
1342       return false;
1343     SDValue Ops[] = { Addr, Chain };
1344     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1345   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1346                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1347     if (TM.is64Bit()) {
1348       switch (N->getOpcode()) {
1349       default:
1350         return false;
1351       case ISD::LOAD:
1352       case ISD::INTRINSIC_W_CHAIN:
1353         if (IsLDG)
1354           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1355                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1356                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1357                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1358                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1359                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1360                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1361         else
1362           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1363                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1364                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1365                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1366                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1367                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1368                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1369         break;
1370       case NVPTXISD::LoadV2:
1371       case NVPTXISD::LDGV2:
1372         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1373                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1374                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1375                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1376                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1377                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1378                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1379         break;
1380       case NVPTXISD::LDUV2:
1381         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1382                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1383                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1384                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1385                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1386                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1387                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1388         break;
1389       case NVPTXISD::LoadV4:
1390       case NVPTXISD::LDGV4:
1391         Opcode = pickOpcodeForVT(
1392             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1393             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1394             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1395             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1396         break;
1397       case NVPTXISD::LDUV4:
1398         Opcode = pickOpcodeForVT(
1399             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1400             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1401             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1402             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1403         break;
1404       }
1405     } else {
1406       switch (N->getOpcode()) {
1407       default:
1408         return false;
1409       case ISD::LOAD:
1410       case ISD::INTRINSIC_W_CHAIN:
1411         if (IsLDG)
1412           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1413                                    NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1414                                    NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1415                                    NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1416                                    NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1417                                    NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1418                                    NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1419         else
1420           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1421                                    NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1422                                    NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1423                                    NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1424                                    NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1425                                    NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1426                                    NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1427         break;
1428       case NVPTXISD::LoadV2:
1429       case NVPTXISD::LDGV2:
1430         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1431                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1432                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1433                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1434                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1435                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1436                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1437         break;
1438       case NVPTXISD::LDUV2:
1439         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1440                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1441                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1442                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1443                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1444                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1445                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1446         break;
1447       case NVPTXISD::LoadV4:
1448       case NVPTXISD::LDGV4:
1449         Opcode = pickOpcodeForVT(
1450             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1451             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1452             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1453             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1454         break;
1455       case NVPTXISD::LDUV4:
1456         Opcode = pickOpcodeForVT(
1457             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1458             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1459             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1460             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1461         break;
1462       }
1463     }
1464     if (!Opcode)
1465       return false;
1466     SDValue Ops[] = {Base, Offset, Chain};
1467     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1468   } else {
1469     if (TM.is64Bit()) {
1470       switch (N->getOpcode()) {
1471       default:
1472         return false;
1473       case ISD::LOAD:
1474       case ISD::INTRINSIC_W_CHAIN:
1475         if (IsLDG)
1476           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1477                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1478                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1479                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1480                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1481                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1482                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1483         else
1484           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1485                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1486                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1487                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1488                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1489                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1490                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1491         break;
1492       case NVPTXISD::LoadV2:
1493       case NVPTXISD::LDGV2:
1494         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1495                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1496                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1497                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1498                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1499                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1500                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1501         break;
1502       case NVPTXISD::LDUV2:
1503         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1504                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1505                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1506                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1507                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1508                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1509                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1510         break;
1511       case NVPTXISD::LoadV4:
1512       case NVPTXISD::LDGV4:
1513         Opcode = pickOpcodeForVT(
1514             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1515             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1516             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1517             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1518         break;
1519       case NVPTXISD::LDUV4:
1520         Opcode = pickOpcodeForVT(
1521             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1522             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1523             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1524             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1525         break;
1526       }
1527     } else {
1528       switch (N->getOpcode()) {
1529       default:
1530         return false;
1531       case ISD::LOAD:
1532       case ISD::INTRINSIC_W_CHAIN:
1533         if (IsLDG)
1534           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1535                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1536                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1537                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1538                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1539                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1540                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1541         else
1542           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1543                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1544                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1545                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1546                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1547                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1548                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1549         break;
1550       case NVPTXISD::LoadV2:
1551       case NVPTXISD::LDGV2:
1552         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1553                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1554                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1555                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1556                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1557                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1558                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1559         break;
1560       case NVPTXISD::LDUV2:
1561         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1562                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1563                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1564                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1565                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1566                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1567                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1568         break;
1569       case NVPTXISD::LoadV4:
1570       case NVPTXISD::LDGV4:
1571         Opcode = pickOpcodeForVT(
1572             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1573             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1574             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1575             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1576         break;
1577       case NVPTXISD::LDUV4:
1578         Opcode = pickOpcodeForVT(
1579             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1580             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1581             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1582             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1583         break;
1584       }
1585     }
1586     if (!Opcode)
1587       return false;
1588     SDValue Ops[] = { Op1, Chain };
1589     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1590   }
1591 
1592   // For automatic generation of LDG (through SelectLoad[Vector], not the
1593   // intrinsics), we may have an extending load like:
1594   //
1595   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1596   //
1597   // In this case, the matching logic above will select a load for the original
1598   // memory type (in this case, i8) and our types will not match (the node needs
1599   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1600   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1601   // CVT instruction. Ptxas should clean up any redundancies here.
1602 
1603   EVT OrigType = N->getValueType(0);
1604   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1605 
1606   if (OrigType != EltVT &&
1607       (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1608     // We have an extending-load. The instruction we selected operates on the
1609     // smaller type, but the SDNode we are replacing has the larger type. We
1610     // need to emit a CVT to make the types match.
1611     unsigned CvtOpc =
1612         GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1613 
1614     // For each output value, apply the manual sign/zero-extension and make sure
1615     // all users of the load go through that CVT.
1616     for (unsigned i = 0; i != NumElts; ++i) {
1617       SDValue Res(LD, i);
1618       SDValue OrigVal(N, i);
1619 
1620       SDNode *CvtNode =
1621         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1622                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1623                                                          DL, MVT::i32));
1624       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1625     }
1626   }
1627 
1628   ReplaceNode(N, LD);
1629   return true;
1630 }
1631 
1632 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1633   SDLoc dl(N);
1634   MemSDNode *ST = cast<MemSDNode>(N);
1635   assert(ST->writeMem() && "Expected store");
1636   StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1637   AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1638   assert((PlainStore || AtomicStore) && "Expected store");
1639   EVT StoreVT = ST->getMemoryVT();
1640   SDNode *NVPTXST = nullptr;
1641 
1642   // do not support pre/post inc/dec
1643   if (PlainStore && PlainStore->isIndexed())
1644     return false;
1645 
1646   if (!StoreVT.isSimple())
1647     return false;
1648 
1649   AtomicOrdering Ordering = ST->getSuccessOrdering();
1650   // In order to lower atomic loads with stronger guarantees we would need to
1651   // use store.release or insert fences. However these features were only added
1652   // with PTX ISA 6.0 / sm_70.
1653   // TODO: Check if we can actually use the new instructions and implement them.
1654   if (isStrongerThanMonotonic(Ordering))
1655     return false;
1656 
1657   // Address Space Setting
1658   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1659   unsigned int PointerSize =
1660       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1661 
1662   // Volatile Setting
1663   // - .volatile is only available for .global and .shared
1664   // - .volatile has the same memory synchronization semantics as .relaxed.sys
1665   bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1666   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1667       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1668       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1669     isVolatile = false;
1670 
1671   // Vector Setting
1672   MVT SimpleVT = StoreVT.getSimpleVT();
1673   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1674 
1675   // Type Setting: toType + toTypeWidth
1676   // - for integer type, always use 'u'
1677   //
1678   MVT ScalarVT = SimpleVT.getScalarType();
1679   unsigned toTypeWidth = ScalarVT.getSizeInBits();
1680   if (SimpleVT.isVector()) {
1681     assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
1682            "Unexpected vector type");
1683     // v2f16 is stored using st.b32
1684     toTypeWidth = 32;
1685   }
1686 
1687   unsigned int toType = getLdStRegType(ScalarVT);
1688 
1689   // Create the machine instruction DAG
1690   SDValue Chain = ST->getChain();
1691   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1692   SDValue BasePtr = ST->getBasePtr();
1693   SDValue Addr;
1694   SDValue Offset, Base;
1695   std::optional<unsigned> Opcode;
1696   MVT::SimpleValueType SourceVT =
1697       Value.getNode()->getSimpleValueType(0).SimpleTy;
1698 
1699   if (SelectDirectAddr(BasePtr, Addr)) {
1700     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1701                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1702                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1703     if (!Opcode)
1704       return false;
1705     SDValue Ops[] = {Value,
1706                      getI32Imm(isVolatile, dl),
1707                      getI32Imm(CodeAddrSpace, dl),
1708                      getI32Imm(vecType, dl),
1709                      getI32Imm(toType, dl),
1710                      getI32Imm(toTypeWidth, dl),
1711                      Addr,
1712                      Chain};
1713     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1714   } else if (PointerSize == 64
1715                  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1716                  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1717     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1718                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1719                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1720     if (!Opcode)
1721       return false;
1722     SDValue Ops[] = {Value,
1723                      getI32Imm(isVolatile, dl),
1724                      getI32Imm(CodeAddrSpace, dl),
1725                      getI32Imm(vecType, dl),
1726                      getI32Imm(toType, dl),
1727                      getI32Imm(toTypeWidth, dl),
1728                      Base,
1729                      Offset,
1730                      Chain};
1731     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1732   } else if (PointerSize == 64
1733                  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1734                  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1735     if (PointerSize == 64)
1736       Opcode =
1737           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1738                           NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1739                           NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1740     else
1741       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1742                                NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1743                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1744     if (!Opcode)
1745       return false;
1746 
1747     SDValue Ops[] = {Value,
1748                      getI32Imm(isVolatile, dl),
1749                      getI32Imm(CodeAddrSpace, dl),
1750                      getI32Imm(vecType, dl),
1751                      getI32Imm(toType, dl),
1752                      getI32Imm(toTypeWidth, dl),
1753                      Base,
1754                      Offset,
1755                      Chain};
1756     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1757   } else {
1758     if (PointerSize == 64)
1759       Opcode =
1760           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1761                           NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1762                           NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1763     else
1764       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1765                                NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1766                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1767     if (!Opcode)
1768       return false;
1769     SDValue Ops[] = {Value,
1770                      getI32Imm(isVolatile, dl),
1771                      getI32Imm(CodeAddrSpace, dl),
1772                      getI32Imm(vecType, dl),
1773                      getI32Imm(toType, dl),
1774                      getI32Imm(toTypeWidth, dl),
1775                      BasePtr,
1776                      Chain};
1777     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1778   }
1779 
1780   if (!NVPTXST)
1781     return false;
1782 
1783   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1784   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1785   ReplaceNode(N, NVPTXST);
1786   return true;
1787 }
1788 
1789 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1790   SDValue Chain = N->getOperand(0);
1791   SDValue Op1 = N->getOperand(1);
1792   SDValue Addr, Offset, Base;
1793   std::optional<unsigned> Opcode;
1794   SDLoc DL(N);
1795   SDNode *ST;
1796   EVT EltVT = Op1.getValueType();
1797   MemSDNode *MemSD = cast<MemSDNode>(N);
1798   EVT StoreVT = MemSD->getMemoryVT();
1799 
1800   // Address Space Setting
1801   unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1802   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1803     report_fatal_error("Cannot store to pointer that points to constant "
1804                        "memory space");
1805   }
1806   unsigned int PointerSize =
1807       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1808 
1809   // Volatile Setting
1810   // - .volatile is only availalble for .global and .shared
1811   bool IsVolatile = MemSD->isVolatile();
1812   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1813       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1814       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1815     IsVolatile = false;
1816 
1817   // Type Setting: toType + toTypeWidth
1818   // - for integer type, always use 'u'
1819   assert(StoreVT.isSimple() && "Store value is not simple");
1820   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1821   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1822   unsigned ToType = getLdStRegType(ScalarVT);
1823 
1824   SmallVector<SDValue, 12> StOps;
1825   SDValue N2;
1826   unsigned VecType;
1827 
1828   switch (N->getOpcode()) {
1829   case NVPTXISD::StoreV2:
1830     VecType = NVPTX::PTXLdStInstCode::V2;
1831     StOps.push_back(N->getOperand(1));
1832     StOps.push_back(N->getOperand(2));
1833     N2 = N->getOperand(3);
1834     break;
1835   case NVPTXISD::StoreV4:
1836     VecType = NVPTX::PTXLdStInstCode::V4;
1837     StOps.push_back(N->getOperand(1));
1838     StOps.push_back(N->getOperand(2));
1839     StOps.push_back(N->getOperand(3));
1840     StOps.push_back(N->getOperand(4));
1841     N2 = N->getOperand(5);
1842     break;
1843   default:
1844     return false;
1845   }
1846 
1847   // v8f16 is a special case. PTX doesn't have st.v8.f16
1848   // instruction. Instead, we split the vector into v2f16 chunks and
1849   // store them with st.v4.b32.
1850   if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
1851     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1852     EltVT = MVT::i32;
1853     ToType = NVPTX::PTXLdStInstCode::Untyped;
1854     ToTypeWidth = 32;
1855   }
1856 
1857   StOps.push_back(getI32Imm(IsVolatile, DL));
1858   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1859   StOps.push_back(getI32Imm(VecType, DL));
1860   StOps.push_back(getI32Imm(ToType, DL));
1861   StOps.push_back(getI32Imm(ToTypeWidth, DL));
1862 
1863   if (SelectDirectAddr(N2, Addr)) {
1864     switch (N->getOpcode()) {
1865     default:
1866       return false;
1867     case NVPTXISD::StoreV2:
1868       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1869                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1870                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1871                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1872       break;
1873     case NVPTXISD::StoreV4:
1874       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1875                                NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1876                                NVPTX::STV_i32_v4_avar, std::nullopt,
1877                                NVPTX::STV_f32_v4_avar, std::nullopt);
1878       break;
1879     }
1880     StOps.push_back(Addr);
1881   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1882                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1883     switch (N->getOpcode()) {
1884     default:
1885       return false;
1886     case NVPTXISD::StoreV2:
1887       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1888                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1889                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1890                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1891       break;
1892     case NVPTXISD::StoreV4:
1893       Opcode =
1894           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1895                           NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1896                           std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1897       break;
1898     }
1899     StOps.push_back(Base);
1900     StOps.push_back(Offset);
1901   } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1902                                : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1903     if (PointerSize == 64) {
1904       switch (N->getOpcode()) {
1905       default:
1906         return false;
1907       case NVPTXISD::StoreV2:
1908         Opcode =
1909             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1910                             NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1911                             NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1912                             NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1913         break;
1914       case NVPTXISD::StoreV4:
1915         Opcode = pickOpcodeForVT(
1916             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1917             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1918             NVPTX::STV_f32_v4_ari_64, std::nullopt);
1919         break;
1920       }
1921     } else {
1922       switch (N->getOpcode()) {
1923       default:
1924         return false;
1925       case NVPTXISD::StoreV2:
1926         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1927                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1928                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1929                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1930         break;
1931       case NVPTXISD::StoreV4:
1932         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1933                                  NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1934                                  NVPTX::STV_i32_v4_ari, std::nullopt,
1935                                  NVPTX::STV_f32_v4_ari, std::nullopt);
1936         break;
1937       }
1938     }
1939     StOps.push_back(Base);
1940     StOps.push_back(Offset);
1941   } else {
1942     if (PointerSize == 64) {
1943       switch (N->getOpcode()) {
1944       default:
1945         return false;
1946       case NVPTXISD::StoreV2:
1947         Opcode = pickOpcodeForVT(
1948             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1949             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1950             NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1951             NVPTX::STV_f64_v2_areg_64);
1952         break;
1953       case NVPTXISD::StoreV4:
1954         Opcode = pickOpcodeForVT(
1955             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1956             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1957             NVPTX::STV_f32_v4_areg_64, std::nullopt);
1958         break;
1959       }
1960     } else {
1961       switch (N->getOpcode()) {
1962       default:
1963         return false;
1964       case NVPTXISD::StoreV2:
1965         Opcode =
1966             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1967                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1968                             NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1969                             NVPTX::STV_f64_v2_areg);
1970         break;
1971       case NVPTXISD::StoreV4:
1972         Opcode =
1973             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1974                             NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1975                             std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1976         break;
1977       }
1978     }
1979     StOps.push_back(N2);
1980   }
1981 
1982   if (!Opcode)
1983     return false;
1984 
1985   StOps.push_back(Chain);
1986 
1987   ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
1988 
1989   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1990   CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
1991 
1992   ReplaceNode(N, ST);
1993   return true;
1994 }
1995 
1996 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
1997   SDValue Chain = Node->getOperand(0);
1998   SDValue Offset = Node->getOperand(2);
1999   SDValue Glue = Node->getOperand(3);
2000   SDLoc DL(Node);
2001   MemSDNode *Mem = cast<MemSDNode>(Node);
2002 
2003   unsigned VecSize;
2004   switch (Node->getOpcode()) {
2005   default:
2006     return false;
2007   case NVPTXISD::LoadParam:
2008     VecSize = 1;
2009     break;
2010   case NVPTXISD::LoadParamV2:
2011     VecSize = 2;
2012     break;
2013   case NVPTXISD::LoadParamV4:
2014     VecSize = 4;
2015     break;
2016   }
2017 
2018   EVT EltVT = Node->getValueType(0);
2019   EVT MemVT = Mem->getMemoryVT();
2020 
2021   std::optional<unsigned> Opcode;
2022 
2023   switch (VecSize) {
2024   default:
2025     return false;
2026   case 1:
2027     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2028                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2029                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2030                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2031     break;
2032   case 2:
2033     Opcode =
2034         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2035                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2036                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2037                         NVPTX::LoadParamMemV2F64);
2038     break;
2039   case 4:
2040     Opcode =
2041         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2042                         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2043                         std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2044     break;
2045   }
2046   if (!Opcode)
2047     return false;
2048 
2049   SDVTList VTs;
2050   if (VecSize == 1) {
2051     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2052   } else if (VecSize == 2) {
2053     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2054   } else {
2055     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2056     VTs = CurDAG->getVTList(EVTs);
2057   }
2058 
2059   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2060 
2061   SmallVector<SDValue, 2> Ops;
2062   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2063   Ops.push_back(Chain);
2064   Ops.push_back(Glue);
2065 
2066   ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2067   return true;
2068 }
2069 
2070 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2071   SDLoc DL(N);
2072   SDValue Chain = N->getOperand(0);
2073   SDValue Offset = N->getOperand(1);
2074   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2075   MemSDNode *Mem = cast<MemSDNode>(N);
2076 
2077   // How many elements do we have?
2078   unsigned NumElts = 1;
2079   switch (N->getOpcode()) {
2080   default:
2081     return false;
2082   case NVPTXISD::StoreRetval:
2083     NumElts = 1;
2084     break;
2085   case NVPTXISD::StoreRetvalV2:
2086     NumElts = 2;
2087     break;
2088   case NVPTXISD::StoreRetvalV4:
2089     NumElts = 4;
2090     break;
2091   }
2092 
2093   // Build vector of operands
2094   SmallVector<SDValue, 6> Ops;
2095   for (unsigned i = 0; i < NumElts; ++i)
2096     Ops.push_back(N->getOperand(i + 2));
2097   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2098   Ops.push_back(Chain);
2099 
2100   // Determine target opcode
2101   // If we have an i1, use an 8-bit store. The lowering code in
2102   // NVPTXISelLowering will have already emitted an upcast.
2103   std::optional<unsigned> Opcode = 0;
2104   switch (NumElts) {
2105   default:
2106     return false;
2107   case 1:
2108     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2109                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2110                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2111                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2112     break;
2113   case 2:
2114     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2115                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2116                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2117                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2118     break;
2119   case 4:
2120     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2121                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2122                              NVPTX::StoreRetvalV4I32, std::nullopt,
2123                              NVPTX::StoreRetvalV4F32, std::nullopt);
2124     break;
2125   }
2126   if (!Opcode)
2127     return false;
2128 
2129   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2130   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2131   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2132 
2133   ReplaceNode(N, Ret);
2134   return true;
2135 }
2136 
2137 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2138   SDLoc DL(N);
2139   SDValue Chain = N->getOperand(0);
2140   SDValue Param = N->getOperand(1);
2141   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2142   SDValue Offset = N->getOperand(2);
2143   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2144   MemSDNode *Mem = cast<MemSDNode>(N);
2145   SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2146 
2147   // How many elements do we have?
2148   unsigned NumElts = 1;
2149   switch (N->getOpcode()) {
2150   default:
2151     return false;
2152   case NVPTXISD::StoreParamU32:
2153   case NVPTXISD::StoreParamS32:
2154   case NVPTXISD::StoreParam:
2155     NumElts = 1;
2156     break;
2157   case NVPTXISD::StoreParamV2:
2158     NumElts = 2;
2159     break;
2160   case NVPTXISD::StoreParamV4:
2161     NumElts = 4;
2162     break;
2163   }
2164 
2165   // Build vector of operands
2166   SmallVector<SDValue, 8> Ops;
2167   for (unsigned i = 0; i < NumElts; ++i)
2168     Ops.push_back(N->getOperand(i + 3));
2169   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2170   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2171   Ops.push_back(Chain);
2172   Ops.push_back(Glue);
2173 
2174   // Determine target opcode
2175   // If we have an i1, use an 8-bit store. The lowering code in
2176   // NVPTXISelLowering will have already emitted an upcast.
2177   std::optional<unsigned> Opcode = 0;
2178   switch (N->getOpcode()) {
2179   default:
2180     switch (NumElts) {
2181     default:
2182       return false;
2183     case 1:
2184       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2185                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2186                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2187                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2188       break;
2189     case 2:
2190       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2191                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2192                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2193                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2194       break;
2195     case 4:
2196       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2197                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2198                                NVPTX::StoreParamV4I32, std::nullopt,
2199                                NVPTX::StoreParamV4F32, std::nullopt);
2200       break;
2201     }
2202     if (!Opcode)
2203       return false;
2204     break;
2205   // Special case: if we have a sign-extend/zero-extend node, insert the
2206   // conversion instruction first, and use that as the value operand to
2207   // the selected StoreParam node.
2208   case NVPTXISD::StoreParamU32: {
2209     Opcode = NVPTX::StoreParamI32;
2210     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2211                                                 MVT::i32);
2212     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2213                                          MVT::i32, Ops[0], CvtNone);
2214     Ops[0] = SDValue(Cvt, 0);
2215     break;
2216   }
2217   case NVPTXISD::StoreParamS32: {
2218     Opcode = NVPTX::StoreParamI32;
2219     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2220                                                 MVT::i32);
2221     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2222                                          MVT::i32, Ops[0], CvtNone);
2223     Ops[0] = SDValue(Cvt, 0);
2224     break;
2225   }
2226   }
2227 
2228   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2229   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2230   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2231   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2232 
2233   ReplaceNode(N, Ret);
2234   return true;
2235 }
2236 
2237 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2238   unsigned Opc = 0;
2239 
2240   switch (N->getOpcode()) {
2241   default: return false;
2242   case NVPTXISD::Tex1DFloatS32:
2243     Opc = NVPTX::TEX_1D_F32_S32_RR;
2244     break;
2245   case NVPTXISD::Tex1DFloatFloat:
2246     Opc = NVPTX::TEX_1D_F32_F32_RR;
2247     break;
2248   case NVPTXISD::Tex1DFloatFloatLevel:
2249     Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2250     break;
2251   case NVPTXISD::Tex1DFloatFloatGrad:
2252     Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2253     break;
2254   case NVPTXISD::Tex1DS32S32:
2255     Opc = NVPTX::TEX_1D_S32_S32_RR;
2256     break;
2257   case NVPTXISD::Tex1DS32Float:
2258     Opc = NVPTX::TEX_1D_S32_F32_RR;
2259     break;
2260   case NVPTXISD::Tex1DS32FloatLevel:
2261     Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2262     break;
2263   case NVPTXISD::Tex1DS32FloatGrad:
2264     Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2265     break;
2266   case NVPTXISD::Tex1DU32S32:
2267     Opc = NVPTX::TEX_1D_U32_S32_RR;
2268     break;
2269   case NVPTXISD::Tex1DU32Float:
2270     Opc = NVPTX::TEX_1D_U32_F32_RR;
2271     break;
2272   case NVPTXISD::Tex1DU32FloatLevel:
2273     Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2274     break;
2275   case NVPTXISD::Tex1DU32FloatGrad:
2276     Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2277     break;
2278   case NVPTXISD::Tex1DArrayFloatS32:
2279     Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2280     break;
2281   case NVPTXISD::Tex1DArrayFloatFloat:
2282     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2283     break;
2284   case NVPTXISD::Tex1DArrayFloatFloatLevel:
2285     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2286     break;
2287   case NVPTXISD::Tex1DArrayFloatFloatGrad:
2288     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2289     break;
2290   case NVPTXISD::Tex1DArrayS32S32:
2291     Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2292     break;
2293   case NVPTXISD::Tex1DArrayS32Float:
2294     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2295     break;
2296   case NVPTXISD::Tex1DArrayS32FloatLevel:
2297     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2298     break;
2299   case NVPTXISD::Tex1DArrayS32FloatGrad:
2300     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2301     break;
2302   case NVPTXISD::Tex1DArrayU32S32:
2303     Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2304     break;
2305   case NVPTXISD::Tex1DArrayU32Float:
2306     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2307     break;
2308   case NVPTXISD::Tex1DArrayU32FloatLevel:
2309     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2310     break;
2311   case NVPTXISD::Tex1DArrayU32FloatGrad:
2312     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2313     break;
2314   case NVPTXISD::Tex2DFloatS32:
2315     Opc = NVPTX::TEX_2D_F32_S32_RR;
2316     break;
2317   case NVPTXISD::Tex2DFloatFloat:
2318     Opc = NVPTX::TEX_2D_F32_F32_RR;
2319     break;
2320   case NVPTXISD::Tex2DFloatFloatLevel:
2321     Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2322     break;
2323   case NVPTXISD::Tex2DFloatFloatGrad:
2324     Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2325     break;
2326   case NVPTXISD::Tex2DS32S32:
2327     Opc = NVPTX::TEX_2D_S32_S32_RR;
2328     break;
2329   case NVPTXISD::Tex2DS32Float:
2330     Opc = NVPTX::TEX_2D_S32_F32_RR;
2331     break;
2332   case NVPTXISD::Tex2DS32FloatLevel:
2333     Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2334     break;
2335   case NVPTXISD::Tex2DS32FloatGrad:
2336     Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2337     break;
2338   case NVPTXISD::Tex2DU32S32:
2339     Opc = NVPTX::TEX_2D_U32_S32_RR;
2340     break;
2341   case NVPTXISD::Tex2DU32Float:
2342     Opc = NVPTX::TEX_2D_U32_F32_RR;
2343     break;
2344   case NVPTXISD::Tex2DU32FloatLevel:
2345     Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2346     break;
2347   case NVPTXISD::Tex2DU32FloatGrad:
2348     Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2349     break;
2350   case NVPTXISD::Tex2DArrayFloatS32:
2351     Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2352     break;
2353   case NVPTXISD::Tex2DArrayFloatFloat:
2354     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2355     break;
2356   case NVPTXISD::Tex2DArrayFloatFloatLevel:
2357     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2358     break;
2359   case NVPTXISD::Tex2DArrayFloatFloatGrad:
2360     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2361     break;
2362   case NVPTXISD::Tex2DArrayS32S32:
2363     Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2364     break;
2365   case NVPTXISD::Tex2DArrayS32Float:
2366     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2367     break;
2368   case NVPTXISD::Tex2DArrayS32FloatLevel:
2369     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2370     break;
2371   case NVPTXISD::Tex2DArrayS32FloatGrad:
2372     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2373     break;
2374   case NVPTXISD::Tex2DArrayU32S32:
2375     Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2376     break;
2377   case NVPTXISD::Tex2DArrayU32Float:
2378     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2379     break;
2380   case NVPTXISD::Tex2DArrayU32FloatLevel:
2381     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2382     break;
2383   case NVPTXISD::Tex2DArrayU32FloatGrad:
2384     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2385     break;
2386   case NVPTXISD::Tex3DFloatS32:
2387     Opc = NVPTX::TEX_3D_F32_S32_RR;
2388     break;
2389   case NVPTXISD::Tex3DFloatFloat:
2390     Opc = NVPTX::TEX_3D_F32_F32_RR;
2391     break;
2392   case NVPTXISD::Tex3DFloatFloatLevel:
2393     Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2394     break;
2395   case NVPTXISD::Tex3DFloatFloatGrad:
2396     Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2397     break;
2398   case NVPTXISD::Tex3DS32S32:
2399     Opc = NVPTX::TEX_3D_S32_S32_RR;
2400     break;
2401   case NVPTXISD::Tex3DS32Float:
2402     Opc = NVPTX::TEX_3D_S32_F32_RR;
2403     break;
2404   case NVPTXISD::Tex3DS32FloatLevel:
2405     Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2406     break;
2407   case NVPTXISD::Tex3DS32FloatGrad:
2408     Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2409     break;
2410   case NVPTXISD::Tex3DU32S32:
2411     Opc = NVPTX::TEX_3D_U32_S32_RR;
2412     break;
2413   case NVPTXISD::Tex3DU32Float:
2414     Opc = NVPTX::TEX_3D_U32_F32_RR;
2415     break;
2416   case NVPTXISD::Tex3DU32FloatLevel:
2417     Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2418     break;
2419   case NVPTXISD::Tex3DU32FloatGrad:
2420     Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2421     break;
2422   case NVPTXISD::TexCubeFloatFloat:
2423     Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2424     break;
2425   case NVPTXISD::TexCubeFloatFloatLevel:
2426     Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2427     break;
2428   case NVPTXISD::TexCubeS32Float:
2429     Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2430     break;
2431   case NVPTXISD::TexCubeS32FloatLevel:
2432     Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2433     break;
2434   case NVPTXISD::TexCubeU32Float:
2435     Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2436     break;
2437   case NVPTXISD::TexCubeU32FloatLevel:
2438     Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2439     break;
2440   case NVPTXISD::TexCubeArrayFloatFloat:
2441     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2442     break;
2443   case NVPTXISD::TexCubeArrayFloatFloatLevel:
2444     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2445     break;
2446   case NVPTXISD::TexCubeArrayS32Float:
2447     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2448     break;
2449   case NVPTXISD::TexCubeArrayS32FloatLevel:
2450     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2451     break;
2452   case NVPTXISD::TexCubeArrayU32Float:
2453     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2454     break;
2455   case NVPTXISD::TexCubeArrayU32FloatLevel:
2456     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2457     break;
2458   case NVPTXISD::Tld4R2DFloatFloat:
2459     Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2460     break;
2461   case NVPTXISD::Tld4G2DFloatFloat:
2462     Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2463     break;
2464   case NVPTXISD::Tld4B2DFloatFloat:
2465     Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2466     break;
2467   case NVPTXISD::Tld4A2DFloatFloat:
2468     Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2469     break;
2470   case NVPTXISD::Tld4R2DS64Float:
2471     Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2472     break;
2473   case NVPTXISD::Tld4G2DS64Float:
2474     Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2475     break;
2476   case NVPTXISD::Tld4B2DS64Float:
2477     Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2478     break;
2479   case NVPTXISD::Tld4A2DS64Float:
2480     Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2481     break;
2482   case NVPTXISD::Tld4R2DU64Float:
2483     Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2484     break;
2485   case NVPTXISD::Tld4G2DU64Float:
2486     Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2487     break;
2488   case NVPTXISD::Tld4B2DU64Float:
2489     Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2490     break;
2491   case NVPTXISD::Tld4A2DU64Float:
2492     Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2493     break;
2494   case NVPTXISD::TexUnified1DFloatS32:
2495     Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2496     break;
2497   case NVPTXISD::TexUnified1DFloatFloat:
2498     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2499     break;
2500   case NVPTXISD::TexUnified1DFloatFloatLevel:
2501     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2502     break;
2503   case NVPTXISD::TexUnified1DFloatFloatGrad:
2504     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2505     break;
2506   case NVPTXISD::TexUnified1DS32S32:
2507     Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2508     break;
2509   case NVPTXISD::TexUnified1DS32Float:
2510     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2511     break;
2512   case NVPTXISD::TexUnified1DS32FloatLevel:
2513     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2514     break;
2515   case NVPTXISD::TexUnified1DS32FloatGrad:
2516     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2517     break;
2518   case NVPTXISD::TexUnified1DU32S32:
2519     Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2520     break;
2521   case NVPTXISD::TexUnified1DU32Float:
2522     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2523     break;
2524   case NVPTXISD::TexUnified1DU32FloatLevel:
2525     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2526     break;
2527   case NVPTXISD::TexUnified1DU32FloatGrad:
2528     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2529     break;
2530   case NVPTXISD::TexUnified1DArrayFloatS32:
2531     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2532     break;
2533   case NVPTXISD::TexUnified1DArrayFloatFloat:
2534     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2535     break;
2536   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2537     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2538     break;
2539   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2540     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2541     break;
2542   case NVPTXISD::TexUnified1DArrayS32S32:
2543     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2544     break;
2545   case NVPTXISD::TexUnified1DArrayS32Float:
2546     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2547     break;
2548   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2549     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2550     break;
2551   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2552     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2553     break;
2554   case NVPTXISD::TexUnified1DArrayU32S32:
2555     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2556     break;
2557   case NVPTXISD::TexUnified1DArrayU32Float:
2558     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2559     break;
2560   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2561     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2562     break;
2563   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2564     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2565     break;
2566   case NVPTXISD::TexUnified2DFloatS32:
2567     Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2568     break;
2569   case NVPTXISD::TexUnified2DFloatFloat:
2570     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2571     break;
2572   case NVPTXISD::TexUnified2DFloatFloatLevel:
2573     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2574     break;
2575   case NVPTXISD::TexUnified2DFloatFloatGrad:
2576     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2577     break;
2578   case NVPTXISD::TexUnified2DS32S32:
2579     Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2580     break;
2581   case NVPTXISD::TexUnified2DS32Float:
2582     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2583     break;
2584   case NVPTXISD::TexUnified2DS32FloatLevel:
2585     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2586     break;
2587   case NVPTXISD::TexUnified2DS32FloatGrad:
2588     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2589     break;
2590   case NVPTXISD::TexUnified2DU32S32:
2591     Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2592     break;
2593   case NVPTXISD::TexUnified2DU32Float:
2594     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2595     break;
2596   case NVPTXISD::TexUnified2DU32FloatLevel:
2597     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2598     break;
2599   case NVPTXISD::TexUnified2DU32FloatGrad:
2600     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2601     break;
2602   case NVPTXISD::TexUnified2DArrayFloatS32:
2603     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2604     break;
2605   case NVPTXISD::TexUnified2DArrayFloatFloat:
2606     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2607     break;
2608   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
2609     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2610     break;
2611   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
2612     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2613     break;
2614   case NVPTXISD::TexUnified2DArrayS32S32:
2615     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2616     break;
2617   case NVPTXISD::TexUnified2DArrayS32Float:
2618     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2619     break;
2620   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
2621     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2622     break;
2623   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
2624     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2625     break;
2626   case NVPTXISD::TexUnified2DArrayU32S32:
2627     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2628     break;
2629   case NVPTXISD::TexUnified2DArrayU32Float:
2630     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2631     break;
2632   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
2633     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2634     break;
2635   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
2636     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2637     break;
2638   case NVPTXISD::TexUnified3DFloatS32:
2639     Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2640     break;
2641   case NVPTXISD::TexUnified3DFloatFloat:
2642     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2643     break;
2644   case NVPTXISD::TexUnified3DFloatFloatLevel:
2645     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2646     break;
2647   case NVPTXISD::TexUnified3DFloatFloatGrad:
2648     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2649     break;
2650   case NVPTXISD::TexUnified3DS32S32:
2651     Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2652     break;
2653   case NVPTXISD::TexUnified3DS32Float:
2654     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2655     break;
2656   case NVPTXISD::TexUnified3DS32FloatLevel:
2657     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2658     break;
2659   case NVPTXISD::TexUnified3DS32FloatGrad:
2660     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2661     break;
2662   case NVPTXISD::TexUnified3DU32S32:
2663     Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2664     break;
2665   case NVPTXISD::TexUnified3DU32Float:
2666     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2667     break;
2668   case NVPTXISD::TexUnified3DU32FloatLevel:
2669     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2670     break;
2671   case NVPTXISD::TexUnified3DU32FloatGrad:
2672     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2673     break;
2674   case NVPTXISD::TexUnifiedCubeFloatFloat:
2675     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2676     break;
2677   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
2678     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2679     break;
2680   case NVPTXISD::TexUnifiedCubeS32Float:
2681     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2682     break;
2683   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
2684     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2685     break;
2686   case NVPTXISD::TexUnifiedCubeU32Float:
2687     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2688     break;
2689   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
2690     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2691     break;
2692   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
2693     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2694     break;
2695   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
2696     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2697     break;
2698   case NVPTXISD::TexUnifiedCubeArrayS32Float:
2699     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2700     break;
2701   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
2702     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2703     break;
2704   case NVPTXISD::TexUnifiedCubeArrayU32Float:
2705     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2706     break;
2707   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
2708     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2709     break;
2710   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
2711     Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2712     break;
2713   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
2714     Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2715     break;
2716   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
2717     Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2718     break;
2719   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
2720     Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2721     break;
2722   case NVPTXISD::Tld4UnifiedR2DS64Float:
2723     Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2724     break;
2725   case NVPTXISD::Tld4UnifiedG2DS64Float:
2726     Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2727     break;
2728   case NVPTXISD::Tld4UnifiedB2DS64Float:
2729     Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2730     break;
2731   case NVPTXISD::Tld4UnifiedA2DS64Float:
2732     Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2733     break;
2734   case NVPTXISD::Tld4UnifiedR2DU64Float:
2735     Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2736     break;
2737   case NVPTXISD::Tld4UnifiedG2DU64Float:
2738     Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2739     break;
2740   case NVPTXISD::Tld4UnifiedB2DU64Float:
2741     Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2742     break;
2743   case NVPTXISD::Tld4UnifiedA2DU64Float:
2744     Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2745     break;
2746   }
2747 
2748   // Copy over operands
2749   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2750   Ops.push_back(N->getOperand(0)); // Move chain to the back.
2751 
2752   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2753   return true;
2754 }
2755 
2756 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2757   unsigned Opc = 0;
2758   switch (N->getOpcode()) {
2759   default: return false;
2760   case NVPTXISD::Suld1DI8Clamp:
2761     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2762     break;
2763   case NVPTXISD::Suld1DI16Clamp:
2764     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2765     break;
2766   case NVPTXISD::Suld1DI32Clamp:
2767     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2768     break;
2769   case NVPTXISD::Suld1DI64Clamp:
2770     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2771     break;
2772   case NVPTXISD::Suld1DV2I8Clamp:
2773     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2774     break;
2775   case NVPTXISD::Suld1DV2I16Clamp:
2776     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2777     break;
2778   case NVPTXISD::Suld1DV2I32Clamp:
2779     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2780     break;
2781   case NVPTXISD::Suld1DV2I64Clamp:
2782     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2783     break;
2784   case NVPTXISD::Suld1DV4I8Clamp:
2785     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2786     break;
2787   case NVPTXISD::Suld1DV4I16Clamp:
2788     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2789     break;
2790   case NVPTXISD::Suld1DV4I32Clamp:
2791     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2792     break;
2793   case NVPTXISD::Suld1DArrayI8Clamp:
2794     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2795     break;
2796   case NVPTXISD::Suld1DArrayI16Clamp:
2797     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2798     break;
2799   case NVPTXISD::Suld1DArrayI32Clamp:
2800     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2801     break;
2802   case NVPTXISD::Suld1DArrayI64Clamp:
2803     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2804     break;
2805   case NVPTXISD::Suld1DArrayV2I8Clamp:
2806     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2807     break;
2808   case NVPTXISD::Suld1DArrayV2I16Clamp:
2809     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2810     break;
2811   case NVPTXISD::Suld1DArrayV2I32Clamp:
2812     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2813     break;
2814   case NVPTXISD::Suld1DArrayV2I64Clamp:
2815     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2816     break;
2817   case NVPTXISD::Suld1DArrayV4I8Clamp:
2818     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2819     break;
2820   case NVPTXISD::Suld1DArrayV4I16Clamp:
2821     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2822     break;
2823   case NVPTXISD::Suld1DArrayV4I32Clamp:
2824     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2825     break;
2826   case NVPTXISD::Suld2DI8Clamp:
2827     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2828     break;
2829   case NVPTXISD::Suld2DI16Clamp:
2830     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2831     break;
2832   case NVPTXISD::Suld2DI32Clamp:
2833     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2834     break;
2835   case NVPTXISD::Suld2DI64Clamp:
2836     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2837     break;
2838   case NVPTXISD::Suld2DV2I8Clamp:
2839     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2840     break;
2841   case NVPTXISD::Suld2DV2I16Clamp:
2842     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2843     break;
2844   case NVPTXISD::Suld2DV2I32Clamp:
2845     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2846     break;
2847   case NVPTXISD::Suld2DV2I64Clamp:
2848     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2849     break;
2850   case NVPTXISD::Suld2DV4I8Clamp:
2851     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2852     break;
2853   case NVPTXISD::Suld2DV4I16Clamp:
2854     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2855     break;
2856   case NVPTXISD::Suld2DV4I32Clamp:
2857     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2858     break;
2859   case NVPTXISD::Suld2DArrayI8Clamp:
2860     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2861     break;
2862   case NVPTXISD::Suld2DArrayI16Clamp:
2863     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2864     break;
2865   case NVPTXISD::Suld2DArrayI32Clamp:
2866     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2867     break;
2868   case NVPTXISD::Suld2DArrayI64Clamp:
2869     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2870     break;
2871   case NVPTXISD::Suld2DArrayV2I8Clamp:
2872     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2873     break;
2874   case NVPTXISD::Suld2DArrayV2I16Clamp:
2875     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2876     break;
2877   case NVPTXISD::Suld2DArrayV2I32Clamp:
2878     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2879     break;
2880   case NVPTXISD::Suld2DArrayV2I64Clamp:
2881     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2882     break;
2883   case NVPTXISD::Suld2DArrayV4I8Clamp:
2884     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2885     break;
2886   case NVPTXISD::Suld2DArrayV4I16Clamp:
2887     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2888     break;
2889   case NVPTXISD::Suld2DArrayV4I32Clamp:
2890     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2891     break;
2892   case NVPTXISD::Suld3DI8Clamp:
2893     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
2894     break;
2895   case NVPTXISD::Suld3DI16Clamp:
2896     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
2897     break;
2898   case NVPTXISD::Suld3DI32Clamp:
2899     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
2900     break;
2901   case NVPTXISD::Suld3DI64Clamp:
2902     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
2903     break;
2904   case NVPTXISD::Suld3DV2I8Clamp:
2905     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
2906     break;
2907   case NVPTXISD::Suld3DV2I16Clamp:
2908     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
2909     break;
2910   case NVPTXISD::Suld3DV2I32Clamp:
2911     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
2912     break;
2913   case NVPTXISD::Suld3DV2I64Clamp:
2914     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
2915     break;
2916   case NVPTXISD::Suld3DV4I8Clamp:
2917     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
2918     break;
2919   case NVPTXISD::Suld3DV4I16Clamp:
2920     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
2921     break;
2922   case NVPTXISD::Suld3DV4I32Clamp:
2923     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
2924     break;
2925   case NVPTXISD::Suld1DI8Trap:
2926     Opc = NVPTX::SULD_1D_I8_TRAP_R;
2927     break;
2928   case NVPTXISD::Suld1DI16Trap:
2929     Opc = NVPTX::SULD_1D_I16_TRAP_R;
2930     break;
2931   case NVPTXISD::Suld1DI32Trap:
2932     Opc = NVPTX::SULD_1D_I32_TRAP_R;
2933     break;
2934   case NVPTXISD::Suld1DI64Trap:
2935     Opc = NVPTX::SULD_1D_I64_TRAP_R;
2936     break;
2937   case NVPTXISD::Suld1DV2I8Trap:
2938     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
2939     break;
2940   case NVPTXISD::Suld1DV2I16Trap:
2941     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
2942     break;
2943   case NVPTXISD::Suld1DV2I32Trap:
2944     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
2945     break;
2946   case NVPTXISD::Suld1DV2I64Trap:
2947     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
2948     break;
2949   case NVPTXISD::Suld1DV4I8Trap:
2950     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
2951     break;
2952   case NVPTXISD::Suld1DV4I16Trap:
2953     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
2954     break;
2955   case NVPTXISD::Suld1DV4I32Trap:
2956     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
2957     break;
2958   case NVPTXISD::Suld1DArrayI8Trap:
2959     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
2960     break;
2961   case NVPTXISD::Suld1DArrayI16Trap:
2962     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
2963     break;
2964   case NVPTXISD::Suld1DArrayI32Trap:
2965     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
2966     break;
2967   case NVPTXISD::Suld1DArrayI64Trap:
2968     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
2969     break;
2970   case NVPTXISD::Suld1DArrayV2I8Trap:
2971     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
2972     break;
2973   case NVPTXISD::Suld1DArrayV2I16Trap:
2974     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
2975     break;
2976   case NVPTXISD::Suld1DArrayV2I32Trap:
2977     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
2978     break;
2979   case NVPTXISD::Suld1DArrayV2I64Trap:
2980     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
2981     break;
2982   case NVPTXISD::Suld1DArrayV4I8Trap:
2983     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
2984     break;
2985   case NVPTXISD::Suld1DArrayV4I16Trap:
2986     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
2987     break;
2988   case NVPTXISD::Suld1DArrayV4I32Trap:
2989     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
2990     break;
2991   case NVPTXISD::Suld2DI8Trap:
2992     Opc = NVPTX::SULD_2D_I8_TRAP_R;
2993     break;
2994   case NVPTXISD::Suld2DI16Trap:
2995     Opc = NVPTX::SULD_2D_I16_TRAP_R;
2996     break;
2997   case NVPTXISD::Suld2DI32Trap:
2998     Opc = NVPTX::SULD_2D_I32_TRAP_R;
2999     break;
3000   case NVPTXISD::Suld2DI64Trap:
3001     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3002     break;
3003   case NVPTXISD::Suld2DV2I8Trap:
3004     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3005     break;
3006   case NVPTXISD::Suld2DV2I16Trap:
3007     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3008     break;
3009   case NVPTXISD::Suld2DV2I32Trap:
3010     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3011     break;
3012   case NVPTXISD::Suld2DV2I64Trap:
3013     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3014     break;
3015   case NVPTXISD::Suld2DV4I8Trap:
3016     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3017     break;
3018   case NVPTXISD::Suld2DV4I16Trap:
3019     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3020     break;
3021   case NVPTXISD::Suld2DV4I32Trap:
3022     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3023     break;
3024   case NVPTXISD::Suld2DArrayI8Trap:
3025     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3026     break;
3027   case NVPTXISD::Suld2DArrayI16Trap:
3028     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3029     break;
3030   case NVPTXISD::Suld2DArrayI32Trap:
3031     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3032     break;
3033   case NVPTXISD::Suld2DArrayI64Trap:
3034     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3035     break;
3036   case NVPTXISD::Suld2DArrayV2I8Trap:
3037     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3038     break;
3039   case NVPTXISD::Suld2DArrayV2I16Trap:
3040     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3041     break;
3042   case NVPTXISD::Suld2DArrayV2I32Trap:
3043     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3044     break;
3045   case NVPTXISD::Suld2DArrayV2I64Trap:
3046     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3047     break;
3048   case NVPTXISD::Suld2DArrayV4I8Trap:
3049     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3050     break;
3051   case NVPTXISD::Suld2DArrayV4I16Trap:
3052     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3053     break;
3054   case NVPTXISD::Suld2DArrayV4I32Trap:
3055     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3056     break;
3057   case NVPTXISD::Suld3DI8Trap:
3058     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3059     break;
3060   case NVPTXISD::Suld3DI16Trap:
3061     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3062     break;
3063   case NVPTXISD::Suld3DI32Trap:
3064     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3065     break;
3066   case NVPTXISD::Suld3DI64Trap:
3067     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3068     break;
3069   case NVPTXISD::Suld3DV2I8Trap:
3070     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3071     break;
3072   case NVPTXISD::Suld3DV2I16Trap:
3073     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3074     break;
3075   case NVPTXISD::Suld3DV2I32Trap:
3076     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3077     break;
3078   case NVPTXISD::Suld3DV2I64Trap:
3079     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3080     break;
3081   case NVPTXISD::Suld3DV4I8Trap:
3082     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3083     break;
3084   case NVPTXISD::Suld3DV4I16Trap:
3085     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3086     break;
3087   case NVPTXISD::Suld3DV4I32Trap:
3088     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3089     break;
3090   case NVPTXISD::Suld1DI8Zero:
3091     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3092     break;
3093   case NVPTXISD::Suld1DI16Zero:
3094     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3095     break;
3096   case NVPTXISD::Suld1DI32Zero:
3097     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3098     break;
3099   case NVPTXISD::Suld1DI64Zero:
3100     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3101     break;
3102   case NVPTXISD::Suld1DV2I8Zero:
3103     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3104     break;
3105   case NVPTXISD::Suld1DV2I16Zero:
3106     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3107     break;
3108   case NVPTXISD::Suld1DV2I32Zero:
3109     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3110     break;
3111   case NVPTXISD::Suld1DV2I64Zero:
3112     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3113     break;
3114   case NVPTXISD::Suld1DV4I8Zero:
3115     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3116     break;
3117   case NVPTXISD::Suld1DV4I16Zero:
3118     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3119     break;
3120   case NVPTXISD::Suld1DV4I32Zero:
3121     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3122     break;
3123   case NVPTXISD::Suld1DArrayI8Zero:
3124     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3125     break;
3126   case NVPTXISD::Suld1DArrayI16Zero:
3127     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3128     break;
3129   case NVPTXISD::Suld1DArrayI32Zero:
3130     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3131     break;
3132   case NVPTXISD::Suld1DArrayI64Zero:
3133     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3134     break;
3135   case NVPTXISD::Suld1DArrayV2I8Zero:
3136     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3137     break;
3138   case NVPTXISD::Suld1DArrayV2I16Zero:
3139     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3140     break;
3141   case NVPTXISD::Suld1DArrayV2I32Zero:
3142     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3143     break;
3144   case NVPTXISD::Suld1DArrayV2I64Zero:
3145     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3146     break;
3147   case NVPTXISD::Suld1DArrayV4I8Zero:
3148     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3149     break;
3150   case NVPTXISD::Suld1DArrayV4I16Zero:
3151     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3152     break;
3153   case NVPTXISD::Suld1DArrayV4I32Zero:
3154     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3155     break;
3156   case NVPTXISD::Suld2DI8Zero:
3157     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3158     break;
3159   case NVPTXISD::Suld2DI16Zero:
3160     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3161     break;
3162   case NVPTXISD::Suld2DI32Zero:
3163     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3164     break;
3165   case NVPTXISD::Suld2DI64Zero:
3166     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3167     break;
3168   case NVPTXISD::Suld2DV2I8Zero:
3169     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3170     break;
3171   case NVPTXISD::Suld2DV2I16Zero:
3172     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3173     break;
3174   case NVPTXISD::Suld2DV2I32Zero:
3175     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3176     break;
3177   case NVPTXISD::Suld2DV2I64Zero:
3178     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3179     break;
3180   case NVPTXISD::Suld2DV4I8Zero:
3181     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3182     break;
3183   case NVPTXISD::Suld2DV4I16Zero:
3184     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3185     break;
3186   case NVPTXISD::Suld2DV4I32Zero:
3187     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3188     break;
3189   case NVPTXISD::Suld2DArrayI8Zero:
3190     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3191     break;
3192   case NVPTXISD::Suld2DArrayI16Zero:
3193     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3194     break;
3195   case NVPTXISD::Suld2DArrayI32Zero:
3196     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3197     break;
3198   case NVPTXISD::Suld2DArrayI64Zero:
3199     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3200     break;
3201   case NVPTXISD::Suld2DArrayV2I8Zero:
3202     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3203     break;
3204   case NVPTXISD::Suld2DArrayV2I16Zero:
3205     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3206     break;
3207   case NVPTXISD::Suld2DArrayV2I32Zero:
3208     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3209     break;
3210   case NVPTXISD::Suld2DArrayV2I64Zero:
3211     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3212     break;
3213   case NVPTXISD::Suld2DArrayV4I8Zero:
3214     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3215     break;
3216   case NVPTXISD::Suld2DArrayV4I16Zero:
3217     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3218     break;
3219   case NVPTXISD::Suld2DArrayV4I32Zero:
3220     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3221     break;
3222   case NVPTXISD::Suld3DI8Zero:
3223     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3224     break;
3225   case NVPTXISD::Suld3DI16Zero:
3226     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3227     break;
3228   case NVPTXISD::Suld3DI32Zero:
3229     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3230     break;
3231   case NVPTXISD::Suld3DI64Zero:
3232     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3233     break;
3234   case NVPTXISD::Suld3DV2I8Zero:
3235     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3236     break;
3237   case NVPTXISD::Suld3DV2I16Zero:
3238     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3239     break;
3240   case NVPTXISD::Suld3DV2I32Zero:
3241     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3242     break;
3243   case NVPTXISD::Suld3DV2I64Zero:
3244     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3245     break;
3246   case NVPTXISD::Suld3DV4I8Zero:
3247     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3248     break;
3249   case NVPTXISD::Suld3DV4I16Zero:
3250     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3251     break;
3252   case NVPTXISD::Suld3DV4I32Zero:
3253     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3254     break;
3255   }
3256 
3257   // Copy over operands
3258   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3259   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3260 
3261   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3262   return true;
3263 }
3264 
3265 
3266 /// SelectBFE - Look for instruction sequences that can be made more efficient
3267 /// by using the 'bfe' (bit-field extract) PTX instruction
3268 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3269   SDLoc DL(N);
3270   SDValue LHS = N->getOperand(0);
3271   SDValue RHS = N->getOperand(1);
3272   SDValue Len;
3273   SDValue Start;
3274   SDValue Val;
3275   bool IsSigned = false;
3276 
3277   if (N->getOpcode() == ISD::AND) {
3278     // Canonicalize the operands
3279     // We want 'and %val, %mask'
3280     if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3281       std::swap(LHS, RHS);
3282     }
3283 
3284     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3285     if (!Mask) {
3286       // We need a constant mask on the RHS of the AND
3287       return false;
3288     }
3289 
3290     // Extract the mask bits
3291     uint64_t MaskVal = Mask->getZExtValue();
3292     if (!isMask_64(MaskVal)) {
3293       // We *could* handle shifted masks here, but doing so would require an
3294       // 'and' operation to fix up the low-order bits so we would trade
3295       // shr+and for bfe+and, which has the same throughput
3296       return false;
3297     }
3298 
3299     // How many bits are in our mask?
3300     int64_t NumBits = countr_one(MaskVal);
3301     Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3302 
3303     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3304       // We have a 'srl/and' pair, extract the effective start bit and length
3305       Val = LHS.getNode()->getOperand(0);
3306       Start = LHS.getNode()->getOperand(1);
3307       ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3308       if (StartConst) {
3309         uint64_t StartVal = StartConst->getZExtValue();
3310         // How many "good" bits do we have left?  "good" is defined here as bits
3311         // that exist in the original value, not shifted in.
3312         int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3313         if (NumBits > GoodBits) {
3314           // Do not handle the case where bits have been shifted in. In theory
3315           // we could handle this, but the cost is likely higher than just
3316           // emitting the srl/and pair.
3317           return false;
3318         }
3319         Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3320       } else {
3321         // Do not handle the case where the shift amount (can be zero if no srl
3322         // was found) is not constant. We could handle this case, but it would
3323         // require run-time logic that would be more expensive than just
3324         // emitting the srl/and pair.
3325         return false;
3326       }
3327     } else {
3328       // Do not handle the case where the LHS of the and is not a shift. While
3329       // it would be trivial to handle this case, it would just transform
3330       // 'and' -> 'bfe', but 'and' has higher-throughput.
3331       return false;
3332     }
3333   } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3334     if (LHS->getOpcode() == ISD::AND) {
3335       ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3336       if (!ShiftCnst) {
3337         // Shift amount must be constant
3338         return false;
3339       }
3340 
3341       uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3342 
3343       SDValue AndLHS = LHS->getOperand(0);
3344       SDValue AndRHS = LHS->getOperand(1);
3345 
3346       // Canonicalize the AND to have the mask on the RHS
3347       if (isa<ConstantSDNode>(AndLHS)) {
3348         std::swap(AndLHS, AndRHS);
3349       }
3350 
3351       ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3352       if (!MaskCnst) {
3353         // Mask must be constant
3354         return false;
3355       }
3356 
3357       uint64_t MaskVal = MaskCnst->getZExtValue();
3358       uint64_t NumZeros;
3359       uint64_t NumBits;
3360       if (isMask_64(MaskVal)) {
3361         NumZeros = 0;
3362         // The number of bits in the result bitfield will be the number of
3363         // trailing ones (the AND) minus the number of bits we shift off
3364         NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3365       } else if (isShiftedMask_64(MaskVal)) {
3366         NumZeros = llvm::countr_zero(MaskVal);
3367         unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3368         // The number of bits in the result bitfield will be the number of
3369         // trailing zeros plus the number of set bits in the mask minus the
3370         // number of bits we shift off
3371         NumBits = NumZeros + NumOnes - ShiftAmt;
3372       } else {
3373         // This is not a mask we can handle
3374         return false;
3375       }
3376 
3377       if (ShiftAmt < NumZeros) {
3378         // Handling this case would require extra logic that would make this
3379         // transformation non-profitable
3380         return false;
3381       }
3382 
3383       Val = AndLHS;
3384       Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3385       Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3386     } else if (LHS->getOpcode() == ISD::SHL) {
3387       // Here, we have a pattern like:
3388       //
3389       // (sra (shl val, NN), MM)
3390       // or
3391       // (srl (shl val, NN), MM)
3392       //
3393       // If MM >= NN, we can efficiently optimize this with bfe
3394       Val = LHS->getOperand(0);
3395 
3396       SDValue ShlRHS = LHS->getOperand(1);
3397       ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3398       if (!ShlCnst) {
3399         // Shift amount must be constant
3400         return false;
3401       }
3402       uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3403 
3404       SDValue ShrRHS = RHS;
3405       ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3406       if (!ShrCnst) {
3407         // Shift amount must be constant
3408         return false;
3409       }
3410       uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3411 
3412       // To avoid extra codegen and be profitable, we need Outer >= Inner
3413       if (OuterShiftAmt < InnerShiftAmt) {
3414         return false;
3415       }
3416 
3417       // If the outer shift is more than the type size, we have no bitfield to
3418       // extract (since we also check that the inner shift is <= the outer shift
3419       // then this also implies that the inner shift is < the type size)
3420       if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3421         return false;
3422       }
3423 
3424       Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3425                                         MVT::i32);
3426       Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3427                                       DL, MVT::i32);
3428 
3429       if (N->getOpcode() == ISD::SRA) {
3430         // If we have a arithmetic right shift, we need to use the signed bfe
3431         // variant
3432         IsSigned = true;
3433       }
3434     } else {
3435       // No can do...
3436       return false;
3437     }
3438   } else {
3439     // No can do...
3440     return false;
3441   }
3442 
3443 
3444   unsigned Opc;
3445   // For the BFE operations we form here from "and" and "srl", always use the
3446   // unsigned variants.
3447   if (Val.getValueType() == MVT::i32) {
3448     if (IsSigned) {
3449       Opc = NVPTX::BFE_S32rii;
3450     } else {
3451       Opc = NVPTX::BFE_U32rii;
3452     }
3453   } else if (Val.getValueType() == MVT::i64) {
3454     if (IsSigned) {
3455       Opc = NVPTX::BFE_S64rii;
3456     } else {
3457       Opc = NVPTX::BFE_U64rii;
3458     }
3459   } else {
3460     // We cannot handle this type
3461     return false;
3462   }
3463 
3464   SDValue Ops[] = {
3465     Val, Start, Len
3466   };
3467 
3468   ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3469   return true;
3470 }
3471 
3472 // SelectDirectAddr - Match a direct address for DAG.
3473 // A direct address could be a globaladdress or externalsymbol.
3474 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3475   // Return true if TGA or ES.
3476   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3477       N.getOpcode() == ISD::TargetExternalSymbol) {
3478     Address = N;
3479     return true;
3480   }
3481   if (N.getOpcode() == NVPTXISD::Wrapper) {
3482     Address = N.getOperand(0);
3483     return true;
3484   }
3485   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3486   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3487     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3488         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3489         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3490       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3491   }
3492   return false;
3493 }
3494 
3495 // symbol+offset
3496 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3497     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3498   if (Addr.getOpcode() == ISD::ADD) {
3499     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3500       SDValue base = Addr.getOperand(0);
3501       if (SelectDirectAddr(base, Base)) {
3502         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3503                                            mvt);
3504         return true;
3505       }
3506     }
3507   }
3508   return false;
3509 }
3510 
3511 // symbol+offset
3512 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3513                                      SDValue &Base, SDValue &Offset) {
3514   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3515 }
3516 
3517 // symbol+offset
3518 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3519                                        SDValue &Base, SDValue &Offset) {
3520   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3521 }
3522 
3523 // register+offset
3524 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3525     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3526   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3527     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3528     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3529     return true;
3530   }
3531   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3532       Addr.getOpcode() == ISD::TargetGlobalAddress)
3533     return false; // direct calls.
3534 
3535   if (Addr.getOpcode() == ISD::ADD) {
3536     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3537       return false;
3538     }
3539     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3540       if (FrameIndexSDNode *FIN =
3541               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3542         // Constant offset from frame ref.
3543         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3544       else
3545         Base = Addr.getOperand(0);
3546       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3547                                          mvt);
3548       return true;
3549     }
3550   }
3551   return false;
3552 }
3553 
3554 // register+offset
3555 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3556                                      SDValue &Base, SDValue &Offset) {
3557   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3558 }
3559 
3560 // register+offset
3561 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3562                                        SDValue &Base, SDValue &Offset) {
3563   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3564 }
3565 
3566 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3567                                                  unsigned int spN) const {
3568   const Value *Src = nullptr;
3569   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3570     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3571       return true;
3572     Src = mN->getMemOperand()->getValue();
3573   }
3574   if (!Src)
3575     return false;
3576   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3577     return (PT->getAddressSpace() == spN);
3578   return false;
3579 }
3580 
3581 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3582 /// inline asm expressions.
3583 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3584     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3585   SDValue Op0, Op1;
3586   switch (ConstraintID) {
3587   default:
3588     return true;
3589   case InlineAsm::Constraint_m: // memory
3590     if (SelectDirectAddr(Op, Op0)) {
3591       OutOps.push_back(Op0);
3592       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3593       return false;
3594     }
3595     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3596       OutOps.push_back(Op0);
3597       OutOps.push_back(Op1);
3598       return false;
3599     }
3600     break;
3601   }
3602   return true;
3603 }
3604 
3605 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3606 /// conversion from \p SrcTy to \p DestTy.
3607 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3608                                              LoadSDNode *LdNode) {
3609   bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3610   switch (SrcTy.SimpleTy) {
3611   default:
3612     llvm_unreachable("Unhandled source type");
3613   case MVT::i8:
3614     switch (DestTy.SimpleTy) {
3615     default:
3616       llvm_unreachable("Unhandled dest type");
3617     case MVT::i16:
3618       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3619     case MVT::i32:
3620       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3621     case MVT::i64:
3622       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3623     }
3624   case MVT::i16:
3625     switch (DestTy.SimpleTy) {
3626     default:
3627       llvm_unreachable("Unhandled dest type");
3628     case MVT::i8:
3629       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3630     case MVT::i32:
3631       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3632     case MVT::i64:
3633       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3634     }
3635   case MVT::i32:
3636     switch (DestTy.SimpleTy) {
3637     default:
3638       llvm_unreachable("Unhandled dest type");
3639     case MVT::i8:
3640       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3641     case MVT::i16:
3642       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3643     case MVT::i64:
3644       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3645     }
3646   case MVT::i64:
3647     switch (DestTy.SimpleTy) {
3648     default:
3649       llvm_unreachable("Unhandled dest type");
3650     case MVT::i8:
3651       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3652     case MVT::i16:
3653       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3654     case MVT::i32:
3655       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3656     }
3657   case MVT::f16:
3658     switch (DestTy.SimpleTy) {
3659     default:
3660       llvm_unreachable("Unhandled dest type");
3661     case MVT::f32:
3662       return NVPTX::CVT_f32_f16;
3663     case MVT::f64:
3664       return NVPTX::CVT_f64_f16;
3665     }
3666   }
3667 }
3668