1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26
27 using namespace llvm;
28
29 #define DEBUG_TYPE "nvptx-isel"
30 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
31
32 /// createNVPTXISelDag - This pass converts a legalized DAG into a
33 /// NVPTX-specific DAG, ready for instruction scheduling.
createNVPTXISelDag(NVPTXTargetMachine & TM,llvm::CodeGenOpt::Level OptLevel)34 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
35 llvm::CodeGenOpt::Level OptLevel) {
36 return new NVPTXDAGToDAGISel(TM, OptLevel);
37 }
38
// Pass identification: the address of ID uniquely identifies this pass to
// the LLVM pass infrastructure; its value is never used.
char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
42
43 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
44 CodeGenOpt::Level OptLevel)
45 : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
46 doMulWide = (OptLevel > 0);
47 }
48
runOnMachineFunction(MachineFunction & MF)49 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
50 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
51 return SelectionDAGISel::runOnMachineFunction(MF);
52 }
53
getDivF32Level() const54 int NVPTXDAGToDAGISel::getDivF32Level() const {
55 return Subtarget->getTargetLowering()->getDivF32Level();
56 }
57
usePrecSqrtF32() const58 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
59 return Subtarget->getTargetLowering()->usePrecSqrtF32();
60 }
61
useF32FTZ() const62 bool NVPTXDAGToDAGISel::useF32FTZ() const {
63 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
64 }
65
allowFMA() const66 bool NVPTXDAGToDAGISel::allowFMA() const {
67 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68 return TL->allowFMA(*MF, OptLevel);
69 }
70
allowUnsafeFPMath() const71 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
72 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
73 return TL->allowUnsafeFPMath(*MF);
74 }
75
useShortPointers() const76 bool NVPTXDAGToDAGISel::useShortPointers() const {
77 return TM.useShortPointers();
78 }
79
80 /// Select - Select instructions not customized! Used for
81 /// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // Dispatch on opcode.  Each try* helper returns true when it replaced the
  // node; on false we fall through to the tablegen-generated SelectCode().
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    // Always hand-selected; SelectSETP_F16X2 unconditionally replaces N.
    SelectSETP_F16X2(N);
    return;

  // Vector loads/stores and parameter/retval traffic.
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // Texture reads (tex.*) and texel gathers (tld4.*), both the
  // non-unified and unified-mode variants, share one selection helper.
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Surface loads (suld.*) with Clamp/Trap/Zero out-of-bounds behavior.
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    // Always hand-selected; SelectAddrSpaceCast unconditionally replaces N.
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  // Fall back to the tablegen-generated matcher.
  SelectCode(N);
}
511
tryIntrinsicChain(SDNode * N)512 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
513 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
514 switch (IID) {
515 default:
516 return false;
517 case Intrinsic::nvvm_ldg_global_f:
518 case Intrinsic::nvvm_ldg_global_i:
519 case Intrinsic::nvvm_ldg_global_p:
520 case Intrinsic::nvvm_ldu_global_f:
521 case Intrinsic::nvvm_ldu_global_i:
522 case Intrinsic::nvvm_ldu_global_p:
523 return tryLDGLDU(N);
524 }
525 }
526
527 // There's no way to specify FP16 immediates in .f16 ops, so we have to
528 // load them into an .f16 register first.
tryConstantFP16(SDNode * N)529 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
530 if (N->getValueType(0) != MVT::f16)
531 return false;
532 SDValue Val = CurDAG->getTargetConstantFP(
533 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
534 SDNode *LoadConstF16 =
535 CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
536 ReplaceNode(N, LoadConstF16);
537 return true;
538 }
539
540 // Map ISD:CONDCODE value to appropriate CmpMode expected by
541 // NVPTXInstPrinter::printCmpMode()
getPTXCmpMode(const CondCodeSDNode & CondCode,bool FTZ)542 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
543 using NVPTX::PTXCmpMode::CmpMode;
544 unsigned PTXCmpMode = [](ISD::CondCode CC) {
545 switch (CC) {
546 default:
547 llvm_unreachable("Unexpected condition code.");
548 case ISD::SETOEQ:
549 return CmpMode::EQ;
550 case ISD::SETOGT:
551 return CmpMode::GT;
552 case ISD::SETOGE:
553 return CmpMode::GE;
554 case ISD::SETOLT:
555 return CmpMode::LT;
556 case ISD::SETOLE:
557 return CmpMode::LE;
558 case ISD::SETONE:
559 return CmpMode::NE;
560 case ISD::SETO:
561 return CmpMode::NUM;
562 case ISD::SETUO:
563 return CmpMode::NotANumber;
564 case ISD::SETUEQ:
565 return CmpMode::EQU;
566 case ISD::SETUGT:
567 return CmpMode::GTU;
568 case ISD::SETUGE:
569 return CmpMode::GEU;
570 case ISD::SETULT:
571 return CmpMode::LTU;
572 case ISD::SETULE:
573 return CmpMode::LEU;
574 case ISD::SETUNE:
575 return CmpMode::NEU;
576 case ISD::SETEQ:
577 return CmpMode::EQ;
578 case ISD::SETGT:
579 return CmpMode::GT;
580 case ISD::SETGE:
581 return CmpMode::GE;
582 case ISD::SETLT:
583 return CmpMode::LT;
584 case ISD::SETLE:
585 return CmpMode::LE;
586 case ISD::SETNE:
587 return CmpMode::NE;
588 }
589 }(CondCode.get());
590
591 if (FTZ)
592 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
593
594 return PTXCmpMode;
595 }
596
SelectSETP_F16X2(SDNode * N)597 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
598 unsigned PTXCmpMode =
599 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
600 SDLoc DL(N);
601 SDNode *SetP = CurDAG->getMachineNode(
602 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
603 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
604 ReplaceNode(N, SetP);
605 return true;
606 }
607
608 // Find all instances of extract_vector_elt that use this v2f16 vector
609 // and coalesce them into a scattering move instruction.
tryEXTRACT_VECTOR_ELEMENT(SDNode * N)610 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
611 SDValue Vector = N->getOperand(0);
612
613 // We only care about f16x2 as it's the only real vector type we
614 // need to deal with.
615 if (Vector.getSimpleValueType() != MVT::v2f16)
616 return false;
617
618 // Find and record all uses of this vector that extract element 0 or 1.
619 SmallVector<SDNode *, 4> E0, E1;
620 for (auto *U : Vector.getNode()->uses()) {
621 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
622 continue;
623 if (U->getOperand(0) != Vector)
624 continue;
625 if (const ConstantSDNode *IdxConst =
626 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
627 if (IdxConst->getZExtValue() == 0)
628 E0.push_back(U);
629 else if (IdxConst->getZExtValue() == 1)
630 E1.push_back(U);
631 else
632 llvm_unreachable("Invalid vector index.");
633 }
634 }
635
636 // There's no point scattering f16x2 if we only ever access one
637 // element of it.
638 if (E0.empty() || E1.empty())
639 return false;
640
641 unsigned Op = NVPTX::SplitF16x2;
642 // If the vector has been BITCAST'ed from i32, we can use original
643 // value directly and avoid register-to-register move.
644 SDValue Source = Vector;
645 if (Vector->getOpcode() == ISD::BITCAST) {
646 Op = NVPTX::SplitI32toF16x2;
647 Source = Vector->getOperand(0);
648 }
649 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
650 // into f16,f16 SplitF16x2(V)
651 SDNode *ScatterOp =
652 CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
653 for (auto *Node : E0)
654 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
655 for (auto *Node : E1)
656 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
657
658 return true;
659 }
660
getCodeAddrSpace(MemSDNode * N)661 static unsigned int getCodeAddrSpace(MemSDNode *N) {
662 const Value *Src = N->getMemOperand()->getValue();
663
664 if (!Src)
665 return NVPTX::PTXLdStInstCode::GENERIC;
666
667 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
668 switch (PT->getAddressSpace()) {
669 case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
670 case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
671 case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
672 case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
673 case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
674 case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
675 default: break;
676 }
677 }
678 return NVPTX::PTXLdStInstCode::GENERIC;
679 }
680
canLowerToLDG(MemSDNode * N,const NVPTXSubtarget & Subtarget,unsigned CodeAddrSpace,MachineFunction * F)681 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
682 unsigned CodeAddrSpace, MachineFunction *F) {
683 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
684 // space.
685 //
686 // We have two ways of identifying invariant loads: Loads may be explicitly
687 // marked as invariant, or we may infer them to be invariant.
688 //
689 // We currently infer invariance for loads from
690 // - constant global variables, and
691 // - kernel function pointer params that are noalias (i.e. __restrict) and
692 // never written to.
693 //
694 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
695 // not during the SelectionDAG phase).
696 //
697 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
698 // explicitly invariant loads because these are how clang tells us to use ldg
699 // when the user uses a builtin.
700 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
701 return false;
702
703 if (N->isInvariant())
704 return true;
705
706 bool IsKernelFn = isKernelFunction(F->getFunction());
707
708 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
709 // because the former looks through phi nodes while the latter does not. We
710 // need to look through phi nodes to handle pointer induction variables.
711 SmallVector<const Value *, 8> Objs;
712 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
713
714 return all_of(Objs, [&](const Value *V) {
715 if (auto *A = dyn_cast<const Argument>(V))
716 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
717 if (auto *GV = dyn_cast<const GlobalVariable>(V))
718 return GV->isConstant();
719 return false;
720 });
721 }
722
tryIntrinsicNoChain(SDNode * N)723 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
724 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
725 switch (IID) {
726 default:
727 return false;
728 case Intrinsic::nvvm_texsurf_handle_internal:
729 SelectTexSurfHandle(N);
730 return true;
731 }
732 }
733
SelectTexSurfHandle(SDNode * N)734 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
735 // Op 0 is the intrinsic ID
736 SDValue Wrapper = N->getOperand(1);
737 SDValue GlobalVal = Wrapper.getOperand(0);
738 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
739 MVT::i64, GlobalVal));
740 }
741
// Lower ISD::ADDRSPACECAST to the matching cvta / cvta.to machine
// instruction.  Only casts to or from the generic address space are
// representable; anything else is a fatal error.
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    // NOTE(review): the _6432 / _3264 opcode suffixes presumably encode the
    // 32<->64-bit pointer-width conversion used in short-pointer mode —
    // confirm against NVPTXIntrinsics.td before touching this table.
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      // Param space has no cvta.to form; a dedicated pseudo is used instead.
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
813
814 // Helper function template to reduce amount of boilerplate code for
815 // opcode selection.
816 static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT,unsigned Opcode_i8,unsigned Opcode_i16,unsigned Opcode_i32,std::optional<unsigned> Opcode_i64,unsigned Opcode_f16,unsigned Opcode_f16x2,unsigned Opcode_f32,std::optional<unsigned> Opcode_f64)817 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
818 unsigned Opcode_i16, unsigned Opcode_i32,
819 std::optional<unsigned> Opcode_i64, unsigned Opcode_f16,
820 unsigned Opcode_f16x2, unsigned Opcode_f32,
821 std::optional<unsigned> Opcode_f64) {
822 switch (VT) {
823 case MVT::i1:
824 case MVT::i8:
825 return Opcode_i8;
826 case MVT::i16:
827 return Opcode_i16;
828 case MVT::i32:
829 return Opcode_i32;
830 case MVT::i64:
831 return Opcode_i64;
832 case MVT::f16:
833 case MVT::bf16:
834 return Opcode_f16;
835 case MVT::v2f16:
836 case MVT::v2bf16:
837 return Opcode_f16x2;
838 case MVT::f32:
839 return Opcode_f32;
840 case MVT::f64:
841 return Opcode_f64;
842 default:
843 return std::nullopt;
844 }
845 }
846
getLdStRegType(EVT VT)847 static int getLdStRegType(EVT VT) {
848 if (VT.isFloatingPoint())
849 switch (VT.getSimpleVT().SimpleTy) {
850 case MVT::f16:
851 case MVT::bf16:
852 case MVT::v2f16:
853 case MVT::v2bf16:
854 return NVPTX::PTXLdStInstCode::Untyped;
855 default:
856 return NVPTX::PTXLdStInstCode::Float;
857 }
858 else
859 return NVPTX::PTXLdStInstCode::Unsigned;
860 }
861
/// Select a PTX scalar load (ld.*) for a plain or atomic load node.
/// Returns false to fall back to default instruction selection when the load
/// cannot be handled here (indexed load, non-simple memory VT, atomic
/// ordering stronger than monotonic, or no matching opcode for the VT).
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null when N is an atomic load (AtomicSDNode); several of the
  // checks below only apply to ordinary LoadSDNodes.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // Loads provably from the global space may instead be selected as
  // non-coherent ld.global.nc (LDG) — hand those off entirely.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  // Monotonic atomic loads are therefore emitted as volatile loads.
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16/v2bf16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG.
  // The four arms below correspond to the PTX addressing modes:
  //   avar = direct address, asi = symbol+imm, ari = reg+imm, areg = register.
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    // reg+imm addressing needs a pointer-width-specific opcode variant.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    // Fallback: plain register addressing; the pointer itself is the operand.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
1007
/// Select a PTX vector load (ld.v2/ld.v4) for NVPTXISD::LoadV2/LoadV4 nodes.
/// Returns false to fall back to default selection when the memory VT is not
/// simple, the opcode is unexpected, or no instruction exists for the element
/// type (e.g. v4i64 / v4f64 have no ld.v4 form).
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Provably-global loads are handed off to the LDG/LDU path instead.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Addressing-mode dispatch, mirroring tryLoad():
  //   avar = direct address, asi = symbol+imm, ari = reg+imm, areg = register.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      // No i64/f64 variants: ld.v4 of 64-bit elements doesn't exist in PTX.
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar,
                               NVPTX::LDV_i32_v4_avar, std::nullopt,
                               NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                               NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi,
                               NVPTX::LDV_i32_v4_asi, std::nullopt,
                               NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                               NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari,
                                 NVPTX::LDV_i32_v4_ari, std::nullopt,
                                 NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                                 NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, std::nullopt,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1243
/// Select an LDG (ld.global.nc) or LDU load. N is either an
/// llvm.nvvm.ldg/ldu intrinsic (ISD::INTRINSIC_W_CHAIN), an ordinary
/// ISD::LOAD / NVPTXISD::LoadV2/V4 that canLowerToLDG() proved global, or an
/// NVPTXISD::LDGV2/V4 / LDUV2/V4 node. Returns false when no matching opcode
/// exists for the element type and addressing mode.
bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second operand
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of f16 are loaded/stored as multiples of v2f16 elements.
    if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = MVT::v2f16;
      NumElts /= 2;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  // Addressing-mode dispatch: direct address (avar), then reg+imm (ari),
  // then plain register (areg); the latter two have 32/64-bit variants.
  // Note this path selects on TM.is64Bit() since LDG/LDU are global-only.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      // No 64-bit-element v4 variants exist.
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  EVT OrigType = N->getValueType(0);
  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT && LdNode) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
                                       EltVT.getSimpleVT(), IsSigned);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
        CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                               CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                         DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}
1713
// Select a scalar (plain or atomic) store into the matching NVPTX ST_*
// machine node. Returns false — leaving the node for other selection
// paths — when the store cannot be handled here: an indexed store, a
// non-simple memory type, or an atomic ordering stronger than monotonic.
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  // The node is either a plain StoreSDNode or an atomic store; exactly one
  // of these dyn_casts succeeds.
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Try the addressing modes from most to least specific: direct address
  // (avar), symbol+immediate (asi), register+immediate (ari), and finally
  // plain register (areg). The ari/areg forms come in 32- and 64-bit pointer
  // variants.
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Transfer the memory operand so later passes (e.g. scheduling, AA) keep
  // accurate memory information for the selected node.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
1875
tryStoreVector(SDNode * N)1876 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1877 SDValue Chain = N->getOperand(0);
1878 SDValue Op1 = N->getOperand(1);
1879 SDValue Addr, Offset, Base;
1880 std::optional<unsigned> Opcode;
1881 SDLoc DL(N);
1882 SDNode *ST;
1883 EVT EltVT = Op1.getValueType();
1884 MemSDNode *MemSD = cast<MemSDNode>(N);
1885 EVT StoreVT = MemSD->getMemoryVT();
1886
1887 // Address Space Setting
1888 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1889 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1890 report_fatal_error("Cannot store to pointer that points to constant "
1891 "memory space");
1892 }
1893 unsigned int PointerSize =
1894 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1895
1896 // Volatile Setting
1897 // - .volatile is only availalble for .global and .shared
1898 bool IsVolatile = MemSD->isVolatile();
1899 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1900 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1901 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1902 IsVolatile = false;
1903
1904 // Type Setting: toType + toTypeWidth
1905 // - for integer type, always use 'u'
1906 assert(StoreVT.isSimple() && "Store value is not simple");
1907 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1908 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1909 unsigned ToType = getLdStRegType(ScalarVT);
1910
1911 SmallVector<SDValue, 12> StOps;
1912 SDValue N2;
1913 unsigned VecType;
1914
1915 switch (N->getOpcode()) {
1916 case NVPTXISD::StoreV2:
1917 VecType = NVPTX::PTXLdStInstCode::V2;
1918 StOps.push_back(N->getOperand(1));
1919 StOps.push_back(N->getOperand(2));
1920 N2 = N->getOperand(3);
1921 break;
1922 case NVPTXISD::StoreV4:
1923 VecType = NVPTX::PTXLdStInstCode::V4;
1924 StOps.push_back(N->getOperand(1));
1925 StOps.push_back(N->getOperand(2));
1926 StOps.push_back(N->getOperand(3));
1927 StOps.push_back(N->getOperand(4));
1928 N2 = N->getOperand(5);
1929 break;
1930 default:
1931 return false;
1932 }
1933
1934 // v8f16 is a special case. PTX doesn't have st.v8.f16
1935 // instruction. Instead, we split the vector into v2f16 chunks and
1936 // store them with st.v4.b32.
1937 if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
1938 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1939 EltVT = MVT::i32;
1940 ToType = NVPTX::PTXLdStInstCode::Untyped;
1941 ToTypeWidth = 32;
1942 }
1943
1944 StOps.push_back(getI32Imm(IsVolatile, DL));
1945 StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1946 StOps.push_back(getI32Imm(VecType, DL));
1947 StOps.push_back(getI32Imm(ToType, DL));
1948 StOps.push_back(getI32Imm(ToTypeWidth, DL));
1949
1950 if (SelectDirectAddr(N2, Addr)) {
1951 switch (N->getOpcode()) {
1952 default:
1953 return false;
1954 case NVPTXISD::StoreV2:
1955 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1956 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1957 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1958 NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1959 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1960 break;
1961 case NVPTXISD::StoreV4:
1962 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1963 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1964 NVPTX::STV_i32_v4_avar, std::nullopt,
1965 NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1966 NVPTX::STV_f32_v4_avar, std::nullopt);
1967 break;
1968 }
1969 StOps.push_back(Addr);
1970 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1971 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1972 switch (N->getOpcode()) {
1973 default:
1974 return false;
1975 case NVPTXISD::StoreV2:
1976 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1977 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1978 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1979 NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1980 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1981 break;
1982 case NVPTXISD::StoreV4:
1983 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1984 NVPTX::STV_i8_v4_asi, NVPTX::STV_i16_v4_asi,
1985 NVPTX::STV_i32_v4_asi, std::nullopt,
1986 NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1987 NVPTX::STV_f32_v4_asi, std::nullopt);
1988 break;
1989 }
1990 StOps.push_back(Base);
1991 StOps.push_back(Offset);
1992 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1993 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1994 if (PointerSize == 64) {
1995 switch (N->getOpcode()) {
1996 default:
1997 return false;
1998 case NVPTXISD::StoreV2:
1999 Opcode = pickOpcodeForVT(
2000 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2001 NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2002 NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2003 NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2004 NVPTX::STV_f64_v2_ari_64);
2005 break;
2006 case NVPTXISD::StoreV4:
2007 Opcode = pickOpcodeForVT(
2008 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2009 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
2010 NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2011 NVPTX::STV_f32_v4_ari_64, std::nullopt);
2012 break;
2013 }
2014 } else {
2015 switch (N->getOpcode()) {
2016 default:
2017 return false;
2018 case NVPTXISD::StoreV2:
2019 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2020 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2021 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2022 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2023 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2024 break;
2025 case NVPTXISD::StoreV4:
2026 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2027 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
2028 NVPTX::STV_i32_v4_ari, std::nullopt,
2029 NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2030 NVPTX::STV_f32_v4_ari, std::nullopt);
2031 break;
2032 }
2033 }
2034 StOps.push_back(Base);
2035 StOps.push_back(Offset);
2036 } else {
2037 if (PointerSize == 64) {
2038 switch (N->getOpcode()) {
2039 default:
2040 return false;
2041 case NVPTXISD::StoreV2:
2042 Opcode = pickOpcodeForVT(
2043 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2044 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2045 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2046 NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2047 NVPTX::STV_f64_v2_areg_64);
2048 break;
2049 case NVPTXISD::StoreV4:
2050 Opcode = pickOpcodeForVT(
2051 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2052 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2053 NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2054 NVPTX::STV_f32_v4_areg_64, std::nullopt);
2055 break;
2056 }
2057 } else {
2058 switch (N->getOpcode()) {
2059 default:
2060 return false;
2061 case NVPTXISD::StoreV2:
2062 Opcode =
2063 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2064 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2065 NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2066 NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2067 NVPTX::STV_f64_v2_areg);
2068 break;
2069 case NVPTXISD::StoreV4:
2070 Opcode = pickOpcodeForVT(
2071 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2072 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, std::nullopt,
2073 NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2074 NVPTX::STV_f32_v4_areg, std::nullopt);
2075 break;
2076 }
2077 }
2078 StOps.push_back(N2);
2079 }
2080
2081 if (!Opcode)
2082 return false;
2083
2084 StOps.push_back(Chain);
2085
2086 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2087
2088 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2089 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2090
2091 ReplaceNode(N, ST);
2092 return true;
2093 }
2094
tryLoadParam(SDNode * Node)2095 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2096 SDValue Chain = Node->getOperand(0);
2097 SDValue Offset = Node->getOperand(2);
2098 SDValue Flag = Node->getOperand(3);
2099 SDLoc DL(Node);
2100 MemSDNode *Mem = cast<MemSDNode>(Node);
2101
2102 unsigned VecSize;
2103 switch (Node->getOpcode()) {
2104 default:
2105 return false;
2106 case NVPTXISD::LoadParam:
2107 VecSize = 1;
2108 break;
2109 case NVPTXISD::LoadParamV2:
2110 VecSize = 2;
2111 break;
2112 case NVPTXISD::LoadParamV4:
2113 VecSize = 4;
2114 break;
2115 }
2116
2117 EVT EltVT = Node->getValueType(0);
2118 EVT MemVT = Mem->getMemoryVT();
2119
2120 std::optional<unsigned> Opcode;
2121
2122 switch (VecSize) {
2123 default:
2124 return false;
2125 case 1:
2126 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2127 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2128 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2129 NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2130 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2131 break;
2132 case 2:
2133 Opcode =
2134 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2135 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2136 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2137 NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2138 NVPTX::LoadParamMemV2F64);
2139 break;
2140 case 4:
2141 Opcode = pickOpcodeForVT(
2142 MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2143 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, std::nullopt,
2144 NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2145 NVPTX::LoadParamMemV4F32, std::nullopt);
2146 break;
2147 }
2148 if (!Opcode)
2149 return false;
2150
2151 SDVTList VTs;
2152 if (VecSize == 1) {
2153 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2154 } else if (VecSize == 2) {
2155 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2156 } else {
2157 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2158 VTs = CurDAG->getVTList(EVTs);
2159 }
2160
2161 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2162
2163 SmallVector<SDValue, 2> Ops;
2164 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2165 Ops.push_back(Chain);
2166 Ops.push_back(Flag);
2167
2168 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2169 return true;
2170 }
2171
tryStoreRetval(SDNode * N)2172 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2173 SDLoc DL(N);
2174 SDValue Chain = N->getOperand(0);
2175 SDValue Offset = N->getOperand(1);
2176 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2177 MemSDNode *Mem = cast<MemSDNode>(N);
2178
2179 // How many elements do we have?
2180 unsigned NumElts = 1;
2181 switch (N->getOpcode()) {
2182 default:
2183 return false;
2184 case NVPTXISD::StoreRetval:
2185 NumElts = 1;
2186 break;
2187 case NVPTXISD::StoreRetvalV2:
2188 NumElts = 2;
2189 break;
2190 case NVPTXISD::StoreRetvalV4:
2191 NumElts = 4;
2192 break;
2193 }
2194
2195 // Build vector of operands
2196 SmallVector<SDValue, 6> Ops;
2197 for (unsigned i = 0; i < NumElts; ++i)
2198 Ops.push_back(N->getOperand(i + 2));
2199 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2200 Ops.push_back(Chain);
2201
2202 // Determine target opcode
2203 // If we have an i1, use an 8-bit store. The lowering code in
2204 // NVPTXISelLowering will have already emitted an upcast.
2205 std::optional<unsigned> Opcode = 0;
2206 switch (NumElts) {
2207 default:
2208 return false;
2209 case 1:
2210 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2212 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2213 NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2214 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2215 break;
2216 case 2:
2217 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2218 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2219 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2220 NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2221 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2222 break;
2223 case 4:
2224 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2225 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2226 NVPTX::StoreRetvalV4I32, std::nullopt,
2227 NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2228 NVPTX::StoreRetvalV4F32, std::nullopt);
2229 break;
2230 }
2231 if (!Opcode)
2232 return false;
2233
2234 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2235 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2236 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2237
2238 ReplaceNode(N, Ret);
2239 return true;
2240 }
2241
tryStoreParam(SDNode * N)2242 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2243 SDLoc DL(N);
2244 SDValue Chain = N->getOperand(0);
2245 SDValue Param = N->getOperand(1);
2246 unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2247 SDValue Offset = N->getOperand(2);
2248 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2249 MemSDNode *Mem = cast<MemSDNode>(N);
2250 SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2251
2252 // How many elements do we have?
2253 unsigned NumElts = 1;
2254 switch (N->getOpcode()) {
2255 default:
2256 return false;
2257 case NVPTXISD::StoreParamU32:
2258 case NVPTXISD::StoreParamS32:
2259 case NVPTXISD::StoreParam:
2260 NumElts = 1;
2261 break;
2262 case NVPTXISD::StoreParamV2:
2263 NumElts = 2;
2264 break;
2265 case NVPTXISD::StoreParamV4:
2266 NumElts = 4;
2267 break;
2268 }
2269
2270 // Build vector of operands
2271 SmallVector<SDValue, 8> Ops;
2272 for (unsigned i = 0; i < NumElts; ++i)
2273 Ops.push_back(N->getOperand(i + 3));
2274 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2275 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2276 Ops.push_back(Chain);
2277 Ops.push_back(Flag);
2278
2279 // Determine target opcode
2280 // If we have an i1, use an 8-bit store. The lowering code in
2281 // NVPTXISelLowering will have already emitted an upcast.
2282 std::optional<unsigned> Opcode = 0;
2283 switch (N->getOpcode()) {
2284 default:
2285 switch (NumElts) {
2286 default:
2287 return false;
2288 case 1:
2289 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2290 NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2291 NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2292 NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2293 NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2294 break;
2295 case 2:
2296 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2297 NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2298 NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2299 NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2300 NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2301 break;
2302 case 4:
2303 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2304 NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2305 NVPTX::StoreParamV4I32, std::nullopt,
2306 NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2307 NVPTX::StoreParamV4F32, std::nullopt);
2308 break;
2309 }
2310 if (!Opcode)
2311 return false;
2312 break;
2313 // Special case: if we have a sign-extend/zero-extend node, insert the
2314 // conversion instruction first, and use that as the value operand to
2315 // the selected StoreParam node.
2316 case NVPTXISD::StoreParamU32: {
2317 Opcode = NVPTX::StoreParamI32;
2318 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2319 MVT::i32);
2320 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2321 MVT::i32, Ops[0], CvtNone);
2322 Ops[0] = SDValue(Cvt, 0);
2323 break;
2324 }
2325 case NVPTXISD::StoreParamS32: {
2326 Opcode = NVPTX::StoreParamI32;
2327 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2328 MVT::i32);
2329 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2330 MVT::i32, Ops[0], CvtNone);
2331 Ops[0] = SDValue(Cvt, 0);
2332 break;
2333 }
2334 }
2335
2336 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2337 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2338 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2339 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2340
2341 ReplaceNode(N, Ret);
2342 return true;
2343 }
2344
tryTextureIntrinsic(SDNode * N)2345 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2346 unsigned Opc = 0;
2347
2348 switch (N->getOpcode()) {
2349 default: return false;
2350 case NVPTXISD::Tex1DFloatS32:
2351 Opc = NVPTX::TEX_1D_F32_S32_RR;
2352 break;
2353 case NVPTXISD::Tex1DFloatFloat:
2354 Opc = NVPTX::TEX_1D_F32_F32_RR;
2355 break;
2356 case NVPTXISD::Tex1DFloatFloatLevel:
2357 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2358 break;
2359 case NVPTXISD::Tex1DFloatFloatGrad:
2360 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2361 break;
2362 case NVPTXISD::Tex1DS32S32:
2363 Opc = NVPTX::TEX_1D_S32_S32_RR;
2364 break;
2365 case NVPTXISD::Tex1DS32Float:
2366 Opc = NVPTX::TEX_1D_S32_F32_RR;
2367 break;
2368 case NVPTXISD::Tex1DS32FloatLevel:
2369 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2370 break;
2371 case NVPTXISD::Tex1DS32FloatGrad:
2372 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2373 break;
2374 case NVPTXISD::Tex1DU32S32:
2375 Opc = NVPTX::TEX_1D_U32_S32_RR;
2376 break;
2377 case NVPTXISD::Tex1DU32Float:
2378 Opc = NVPTX::TEX_1D_U32_F32_RR;
2379 break;
2380 case NVPTXISD::Tex1DU32FloatLevel:
2381 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2382 break;
2383 case NVPTXISD::Tex1DU32FloatGrad:
2384 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2385 break;
2386 case NVPTXISD::Tex1DArrayFloatS32:
2387 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2388 break;
2389 case NVPTXISD::Tex1DArrayFloatFloat:
2390 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2391 break;
2392 case NVPTXISD::Tex1DArrayFloatFloatLevel:
2393 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2394 break;
2395 case NVPTXISD::Tex1DArrayFloatFloatGrad:
2396 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2397 break;
2398 case NVPTXISD::Tex1DArrayS32S32:
2399 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2400 break;
2401 case NVPTXISD::Tex1DArrayS32Float:
2402 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2403 break;
2404 case NVPTXISD::Tex1DArrayS32FloatLevel:
2405 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2406 break;
2407 case NVPTXISD::Tex1DArrayS32FloatGrad:
2408 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2409 break;
2410 case NVPTXISD::Tex1DArrayU32S32:
2411 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2412 break;
2413 case NVPTXISD::Tex1DArrayU32Float:
2414 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2415 break;
2416 case NVPTXISD::Tex1DArrayU32FloatLevel:
2417 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2418 break;
2419 case NVPTXISD::Tex1DArrayU32FloatGrad:
2420 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2421 break;
2422 case NVPTXISD::Tex2DFloatS32:
2423 Opc = NVPTX::TEX_2D_F32_S32_RR;
2424 break;
2425 case NVPTXISD::Tex2DFloatFloat:
2426 Opc = NVPTX::TEX_2D_F32_F32_RR;
2427 break;
2428 case NVPTXISD::Tex2DFloatFloatLevel:
2429 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2430 break;
2431 case NVPTXISD::Tex2DFloatFloatGrad:
2432 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2433 break;
2434 case NVPTXISD::Tex2DS32S32:
2435 Opc = NVPTX::TEX_2D_S32_S32_RR;
2436 break;
2437 case NVPTXISD::Tex2DS32Float:
2438 Opc = NVPTX::TEX_2D_S32_F32_RR;
2439 break;
2440 case NVPTXISD::Tex2DS32FloatLevel:
2441 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2442 break;
2443 case NVPTXISD::Tex2DS32FloatGrad:
2444 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2445 break;
2446 case NVPTXISD::Tex2DU32S32:
2447 Opc = NVPTX::TEX_2D_U32_S32_RR;
2448 break;
2449 case NVPTXISD::Tex2DU32Float:
2450 Opc = NVPTX::TEX_2D_U32_F32_RR;
2451 break;
2452 case NVPTXISD::Tex2DU32FloatLevel:
2453 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2454 break;
2455 case NVPTXISD::Tex2DU32FloatGrad:
2456 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2457 break;
2458 case NVPTXISD::Tex2DArrayFloatS32:
2459 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2460 break;
2461 case NVPTXISD::Tex2DArrayFloatFloat:
2462 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2463 break;
2464 case NVPTXISD::Tex2DArrayFloatFloatLevel:
2465 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2466 break;
2467 case NVPTXISD::Tex2DArrayFloatFloatGrad:
2468 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2469 break;
2470 case NVPTXISD::Tex2DArrayS32S32:
2471 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2472 break;
2473 case NVPTXISD::Tex2DArrayS32Float:
2474 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2475 break;
2476 case NVPTXISD::Tex2DArrayS32FloatLevel:
2477 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2478 break;
2479 case NVPTXISD::Tex2DArrayS32FloatGrad:
2480 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2481 break;
2482 case NVPTXISD::Tex2DArrayU32S32:
2483 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2484 break;
2485 case NVPTXISD::Tex2DArrayU32Float:
2486 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2487 break;
2488 case NVPTXISD::Tex2DArrayU32FloatLevel:
2489 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2490 break;
2491 case NVPTXISD::Tex2DArrayU32FloatGrad:
2492 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2493 break;
2494 case NVPTXISD::Tex3DFloatS32:
2495 Opc = NVPTX::TEX_3D_F32_S32_RR;
2496 break;
2497 case NVPTXISD::Tex3DFloatFloat:
2498 Opc = NVPTX::TEX_3D_F32_F32_RR;
2499 break;
2500 case NVPTXISD::Tex3DFloatFloatLevel:
2501 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2502 break;
2503 case NVPTXISD::Tex3DFloatFloatGrad:
2504 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2505 break;
2506 case NVPTXISD::Tex3DS32S32:
2507 Opc = NVPTX::TEX_3D_S32_S32_RR;
2508 break;
2509 case NVPTXISD::Tex3DS32Float:
2510 Opc = NVPTX::TEX_3D_S32_F32_RR;
2511 break;
2512 case NVPTXISD::Tex3DS32FloatLevel:
2513 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2514 break;
2515 case NVPTXISD::Tex3DS32FloatGrad:
2516 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2517 break;
2518 case NVPTXISD::Tex3DU32S32:
2519 Opc = NVPTX::TEX_3D_U32_S32_RR;
2520 break;
2521 case NVPTXISD::Tex3DU32Float:
2522 Opc = NVPTX::TEX_3D_U32_F32_RR;
2523 break;
2524 case NVPTXISD::Tex3DU32FloatLevel:
2525 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2526 break;
2527 case NVPTXISD::Tex3DU32FloatGrad:
2528 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2529 break;
2530 case NVPTXISD::TexCubeFloatFloat:
2531 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2532 break;
2533 case NVPTXISD::TexCubeFloatFloatLevel:
2534 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2535 break;
2536 case NVPTXISD::TexCubeS32Float:
2537 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2538 break;
2539 case NVPTXISD::TexCubeS32FloatLevel:
2540 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2541 break;
2542 case NVPTXISD::TexCubeU32Float:
2543 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2544 break;
2545 case NVPTXISD::TexCubeU32FloatLevel:
2546 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2547 break;
2548 case NVPTXISD::TexCubeArrayFloatFloat:
2549 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2550 break;
2551 case NVPTXISD::TexCubeArrayFloatFloatLevel:
2552 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2553 break;
2554 case NVPTXISD::TexCubeArrayS32Float:
2555 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2556 break;
2557 case NVPTXISD::TexCubeArrayS32FloatLevel:
2558 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2559 break;
2560 case NVPTXISD::TexCubeArrayU32Float:
2561 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2562 break;
2563 case NVPTXISD::TexCubeArrayU32FloatLevel:
2564 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2565 break;
2566 case NVPTXISD::Tld4R2DFloatFloat:
2567 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2568 break;
2569 case NVPTXISD::Tld4G2DFloatFloat:
2570 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2571 break;
2572 case NVPTXISD::Tld4B2DFloatFloat:
2573 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2574 break;
2575 case NVPTXISD::Tld4A2DFloatFloat:
2576 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2577 break;
2578 case NVPTXISD::Tld4R2DS64Float:
2579 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2580 break;
2581 case NVPTXISD::Tld4G2DS64Float:
2582 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2583 break;
2584 case NVPTXISD::Tld4B2DS64Float:
2585 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2586 break;
2587 case NVPTXISD::Tld4A2DS64Float:
2588 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2589 break;
2590 case NVPTXISD::Tld4R2DU64Float:
2591 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2592 break;
2593 case NVPTXISD::Tld4G2DU64Float:
2594 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2595 break;
2596 case NVPTXISD::Tld4B2DU64Float:
2597 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2598 break;
2599 case NVPTXISD::Tld4A2DU64Float:
2600 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2601 break;
2602 case NVPTXISD::TexUnified1DFloatS32:
2603 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2604 break;
2605 case NVPTXISD::TexUnified1DFloatFloat:
2606 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2607 break;
2608 case NVPTXISD::TexUnified1DFloatFloatLevel:
2609 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2610 break;
2611 case NVPTXISD::TexUnified1DFloatFloatGrad:
2612 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2613 break;
2614 case NVPTXISD::TexUnified1DS32S32:
2615 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2616 break;
2617 case NVPTXISD::TexUnified1DS32Float:
2618 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2619 break;
2620 case NVPTXISD::TexUnified1DS32FloatLevel:
2621 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2622 break;
2623 case NVPTXISD::TexUnified1DS32FloatGrad:
2624 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2625 break;
2626 case NVPTXISD::TexUnified1DU32S32:
2627 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2628 break;
2629 case NVPTXISD::TexUnified1DU32Float:
2630 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2631 break;
2632 case NVPTXISD::TexUnified1DU32FloatLevel:
2633 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2634 break;
2635 case NVPTXISD::TexUnified1DU32FloatGrad:
2636 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2637 break;
2638 case NVPTXISD::TexUnified1DArrayFloatS32:
2639 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2640 break;
2641 case NVPTXISD::TexUnified1DArrayFloatFloat:
2642 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2643 break;
2644 case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2645 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2646 break;
2647 case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2648 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2649 break;
2650 case NVPTXISD::TexUnified1DArrayS32S32:
2651 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2652 break;
2653 case NVPTXISD::TexUnified1DArrayS32Float:
2654 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2655 break;
2656 case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2657 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2658 break;
2659 case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2660 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2661 break;
2662 case NVPTXISD::TexUnified1DArrayU32S32:
2663 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2664 break;
2665 case NVPTXISD::TexUnified1DArrayU32Float:
2666 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2667 break;
2668 case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2669 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2670 break;
2671 case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2672 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2673 break;
2674 case NVPTXISD::TexUnified2DFloatS32:
2675 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2676 break;
2677 case NVPTXISD::TexUnified2DFloatFloat:
2678 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2679 break;
2680 case NVPTXISD::TexUnified2DFloatFloatLevel:
2681 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2682 break;
2683 case NVPTXISD::TexUnified2DFloatFloatGrad:
2684 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2685 break;
2686 case NVPTXISD::TexUnified2DS32S32:
2687 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2688 break;
2689 case NVPTXISD::TexUnified2DS32Float:
2690 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2691 break;
2692 case NVPTXISD::TexUnified2DS32FloatLevel:
2693 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2694 break;
2695 case NVPTXISD::TexUnified2DS32FloatGrad:
2696 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2697 break;
2698 case NVPTXISD::TexUnified2DU32S32:
2699 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2700 break;
2701 case NVPTXISD::TexUnified2DU32Float:
2702 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2703 break;
2704 case NVPTXISD::TexUnified2DU32FloatLevel:
2705 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2706 break;
2707 case NVPTXISD::TexUnified2DU32FloatGrad:
2708 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2709 break;
2710 case NVPTXISD::TexUnified2DArrayFloatS32:
2711 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2712 break;
2713 case NVPTXISD::TexUnified2DArrayFloatFloat:
2714 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2715 break;
2716 case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
2717 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2718 break;
2719 case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
2720 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2721 break;
2722 case NVPTXISD::TexUnified2DArrayS32S32:
2723 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2724 break;
2725 case NVPTXISD::TexUnified2DArrayS32Float:
2726 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2727 break;
2728 case NVPTXISD::TexUnified2DArrayS32FloatLevel:
2729 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2730 break;
2731 case NVPTXISD::TexUnified2DArrayS32FloatGrad:
2732 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2733 break;
2734 case NVPTXISD::TexUnified2DArrayU32S32:
2735 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2736 break;
2737 case NVPTXISD::TexUnified2DArrayU32Float:
2738 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2739 break;
2740 case NVPTXISD::TexUnified2DArrayU32FloatLevel:
2741 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2742 break;
2743 case NVPTXISD::TexUnified2DArrayU32FloatGrad:
2744 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2745 break;
2746 case NVPTXISD::TexUnified3DFloatS32:
2747 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2748 break;
2749 case NVPTXISD::TexUnified3DFloatFloat:
2750 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2751 break;
2752 case NVPTXISD::TexUnified3DFloatFloatLevel:
2753 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2754 break;
2755 case NVPTXISD::TexUnified3DFloatFloatGrad:
2756 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2757 break;
2758 case NVPTXISD::TexUnified3DS32S32:
2759 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2760 break;
2761 case NVPTXISD::TexUnified3DS32Float:
2762 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2763 break;
2764 case NVPTXISD::TexUnified3DS32FloatLevel:
2765 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2766 break;
2767 case NVPTXISD::TexUnified3DS32FloatGrad:
2768 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2769 break;
2770 case NVPTXISD::TexUnified3DU32S32:
2771 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2772 break;
2773 case NVPTXISD::TexUnified3DU32Float:
2774 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2775 break;
2776 case NVPTXISD::TexUnified3DU32FloatLevel:
2777 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2778 break;
2779 case NVPTXISD::TexUnified3DU32FloatGrad:
2780 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2781 break;
2782 case NVPTXISD::TexUnifiedCubeFloatFloat:
2783 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2784 break;
2785 case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
2786 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2787 break;
2788 case NVPTXISD::TexUnifiedCubeS32Float:
2789 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2790 break;
2791 case NVPTXISD::TexUnifiedCubeS32FloatLevel:
2792 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2793 break;
2794 case NVPTXISD::TexUnifiedCubeU32Float:
2795 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2796 break;
2797 case NVPTXISD::TexUnifiedCubeU32FloatLevel:
2798 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2799 break;
2800 case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
2801 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2802 break;
2803 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
2804 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2805 break;
2806 case NVPTXISD::TexUnifiedCubeArrayS32Float:
2807 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2808 break;
2809 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
2810 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2811 break;
2812 case NVPTXISD::TexUnifiedCubeArrayU32Float:
2813 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2814 break;
2815 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
2816 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2817 break;
2818 case NVPTXISD::Tld4UnifiedR2DFloatFloat:
2819 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2820 break;
2821 case NVPTXISD::Tld4UnifiedG2DFloatFloat:
2822 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2823 break;
2824 case NVPTXISD::Tld4UnifiedB2DFloatFloat:
2825 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2826 break;
2827 case NVPTXISD::Tld4UnifiedA2DFloatFloat:
2828 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2829 break;
2830 case NVPTXISD::Tld4UnifiedR2DS64Float:
2831 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2832 break;
2833 case NVPTXISD::Tld4UnifiedG2DS64Float:
2834 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2835 break;
2836 case NVPTXISD::Tld4UnifiedB2DS64Float:
2837 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2838 break;
2839 case NVPTXISD::Tld4UnifiedA2DS64Float:
2840 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2841 break;
2842 case NVPTXISD::Tld4UnifiedR2DU64Float:
2843 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2844 break;
2845 case NVPTXISD::Tld4UnifiedG2DU64Float:
2846 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2847 break;
2848 case NVPTXISD::Tld4UnifiedB2DU64Float:
2849 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2850 break;
2851 case NVPTXISD::Tld4UnifiedA2DU64Float:
2852 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2853 break;
2854 }
2855
2856 // Copy over operands
2857 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2858 Ops.push_back(N->getOperand(0)); // Move chain to the back.
2859
2860 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2861 return true;
2862 }
2863
/// Select a surface-load (suld) intrinsic node into its machine opcode.
///
/// The opcode name encodes four independent axes of the operation:
///   - geometry: 1D, 1D array, 2D, 2D array, or 3D surface;
///   - vector width: scalar, v2, or v4 elements per load;
///   - element width: i8, i16, i32, or i64;
///   - out-of-bounds handling mode: CLAMP, TRAP, or ZERO.
///
/// Returns false for any node that is not a surface load, so that the
/// normal selection path handles it; returns true after replacing \p N
/// with the selected machine node.
bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default: return false;
  // CLAMP-mode variants (Suld*Clamp -> SULD_*_CLAMP_R).
  case NVPTXISD::Suld1DI8Clamp:
    Opc = NVPTX::SULD_1D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI16Clamp:
    Opc = NVPTX::SULD_1D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI32Clamp:
    Opc = NVPTX::SULD_1D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI64Clamp:
    Opc = NVPTX::SULD_1D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I8Clamp:
    Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I16Clamp:
    Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I32Clamp:
    Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I64Clamp:
    Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I8Clamp:
    Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I16Clamp:
    Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I32Clamp:
    Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI8Clamp:
    Opc = NVPTX::SULD_2D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI16Clamp:
    Opc = NVPTX::SULD_2D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI32Clamp:
    Opc = NVPTX::SULD_2D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI64Clamp:
    Opc = NVPTX::SULD_2D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I8Clamp:
    Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I16Clamp:
    Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I32Clamp:
    Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I64Clamp:
    Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I8Clamp:
    Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I16Clamp:
    Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I32Clamp:
    Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI8Clamp:
    Opc = NVPTX::SULD_3D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI16Clamp:
    Opc = NVPTX::SULD_3D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI32Clamp:
    Opc = NVPTX::SULD_3D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI64Clamp:
    Opc = NVPTX::SULD_3D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I8Clamp:
    Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I16Clamp:
    Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I32Clamp:
    Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I64Clamp:
    Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I8Clamp:
    Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I16Clamp:
    Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I32Clamp:
    Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
    break;
  // TRAP-mode variants (Suld*Trap -> SULD_*_TRAP_R).
  case NVPTXISD::Suld1DI8Trap:
    Opc = NVPTX::SULD_1D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DI16Trap:
    Opc = NVPTX::SULD_1D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DI32Trap:
    Opc = NVPTX::SULD_1D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI64Trap:
    Opc = NVPTX::SULD_1D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I8Trap:
    Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I16Trap:
    Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I32Trap:
    Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I64Trap:
    Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I8Trap:
    Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I16Trap:
    Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I32Trap:
    Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI8Trap:
    Opc = NVPTX::SULD_2D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DI16Trap:
    Opc = NVPTX::SULD_2D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DI32Trap:
    Opc = NVPTX::SULD_2D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI64Trap:
    Opc = NVPTX::SULD_2D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I8Trap:
    Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I16Trap:
    Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I32Trap:
    Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I64Trap:
    Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I8Trap:
    Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I16Trap:
    Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I32Trap:
    Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI8Trap:
    Opc = NVPTX::SULD_3D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DI16Trap:
    Opc = NVPTX::SULD_3D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DI32Trap:
    Opc = NVPTX::SULD_3D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI64Trap:
    Opc = NVPTX::SULD_3D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I8Trap:
    Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I16Trap:
    Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I32Trap:
    Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I64Trap:
    Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I8Trap:
    Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I16Trap:
    Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I32Trap:
    Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
    break;
  // ZERO-mode variants (Suld*Zero -> SULD_*_ZERO_R).
  case NVPTXISD::Suld1DI8Zero:
    Opc = NVPTX::SULD_1D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DI16Zero:
    Opc = NVPTX::SULD_1D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DI32Zero:
    Opc = NVPTX::SULD_1D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DI64Zero:
    Opc = NVPTX::SULD_1D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I8Zero:
    Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I16Zero:
    Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I32Zero:
    Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I64Zero:
    Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I8Zero:
    Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I16Zero:
    Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I32Zero:
    Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI8Zero:
    Opc = NVPTX::SULD_2D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DI16Zero:
    Opc = NVPTX::SULD_2D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DI32Zero:
    Opc = NVPTX::SULD_2D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI64Zero:
    Opc = NVPTX::SULD_2D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I8Zero:
    Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I16Zero:
    Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
3372
3373
3374 /// SelectBFE - Look for instruction sequences that can be made more efficient
3375 /// by using the 'bfe' (bit-field extract) PTX instruction
tryBFE(SDNode * N)3376 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3377 SDLoc DL(N);
3378 SDValue LHS = N->getOperand(0);
3379 SDValue RHS = N->getOperand(1);
3380 SDValue Len;
3381 SDValue Start;
3382 SDValue Val;
3383 bool IsSigned = false;
3384
3385 if (N->getOpcode() == ISD::AND) {
3386 // Canonicalize the operands
3387 // We want 'and %val, %mask'
3388 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3389 std::swap(LHS, RHS);
3390 }
3391
3392 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3393 if (!Mask) {
3394 // We need a constant mask on the RHS of the AND
3395 return false;
3396 }
3397
3398 // Extract the mask bits
3399 uint64_t MaskVal = Mask->getZExtValue();
3400 if (!isMask_64(MaskVal)) {
3401 // We *could* handle shifted masks here, but doing so would require an
3402 // 'and' operation to fix up the low-order bits so we would trade
3403 // shr+and for bfe+and, which has the same throughput
3404 return false;
3405 }
3406
3407 // How many bits are in our mask?
3408 uint64_t NumBits = countTrailingOnes(MaskVal);
3409 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3410
3411 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3412 // We have a 'srl/and' pair, extract the effective start bit and length
3413 Val = LHS.getNode()->getOperand(0);
3414 Start = LHS.getNode()->getOperand(1);
3415 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3416 if (StartConst) {
3417 uint64_t StartVal = StartConst->getZExtValue();
3418 // How many "good" bits do we have left? "good" is defined here as bits
3419 // that exist in the original value, not shifted in.
3420 uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3421 if (NumBits > GoodBits) {
3422 // Do not handle the case where bits have been shifted in. In theory
3423 // we could handle this, but the cost is likely higher than just
3424 // emitting the srl/and pair.
3425 return false;
3426 }
3427 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3428 } else {
3429 // Do not handle the case where the shift amount (can be zero if no srl
3430 // was found) is not constant. We could handle this case, but it would
3431 // require run-time logic that would be more expensive than just
3432 // emitting the srl/and pair.
3433 return false;
3434 }
3435 } else {
3436 // Do not handle the case where the LHS of the and is not a shift. While
3437 // it would be trivial to handle this case, it would just transform
3438 // 'and' -> 'bfe', but 'and' has higher-throughput.
3439 return false;
3440 }
3441 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3442 if (LHS->getOpcode() == ISD::AND) {
3443 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3444 if (!ShiftCnst) {
3445 // Shift amount must be constant
3446 return false;
3447 }
3448
3449 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3450
3451 SDValue AndLHS = LHS->getOperand(0);
3452 SDValue AndRHS = LHS->getOperand(1);
3453
3454 // Canonicalize the AND to have the mask on the RHS
3455 if (isa<ConstantSDNode>(AndLHS)) {
3456 std::swap(AndLHS, AndRHS);
3457 }
3458
3459 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3460 if (!MaskCnst) {
3461 // Mask must be constant
3462 return false;
3463 }
3464
3465 uint64_t MaskVal = MaskCnst->getZExtValue();
3466 uint64_t NumZeros;
3467 uint64_t NumBits;
3468 if (isMask_64(MaskVal)) {
3469 NumZeros = 0;
3470 // The number of bits in the result bitfield will be the number of
3471 // trailing ones (the AND) minus the number of bits we shift off
3472 NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
3473 } else if (isShiftedMask_64(MaskVal)) {
3474 NumZeros = countTrailingZeros(MaskVal);
3475 unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
3476 // The number of bits in the result bitfield will be the number of
3477 // trailing zeros plus the number of set bits in the mask minus the
3478 // number of bits we shift off
3479 NumBits = NumZeros + NumOnes - ShiftAmt;
3480 } else {
3481 // This is not a mask we can handle
3482 return false;
3483 }
3484
3485 if (ShiftAmt < NumZeros) {
3486 // Handling this case would require extra logic that would make this
3487 // transformation non-profitable
3488 return false;
3489 }
3490
3491 Val = AndLHS;
3492 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3493 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3494 } else if (LHS->getOpcode() == ISD::SHL) {
3495 // Here, we have a pattern like:
3496 //
3497 // (sra (shl val, NN), MM)
3498 // or
3499 // (srl (shl val, NN), MM)
3500 //
3501 // If MM >= NN, we can efficiently optimize this with bfe
3502 Val = LHS->getOperand(0);
3503
3504 SDValue ShlRHS = LHS->getOperand(1);
3505 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3506 if (!ShlCnst) {
3507 // Shift amount must be constant
3508 return false;
3509 }
3510 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3511
3512 SDValue ShrRHS = RHS;
3513 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3514 if (!ShrCnst) {
3515 // Shift amount must be constant
3516 return false;
3517 }
3518 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3519
3520 // To avoid extra codegen and be profitable, we need Outer >= Inner
3521 if (OuterShiftAmt < InnerShiftAmt) {
3522 return false;
3523 }
3524
3525 // If the outer shift is more than the type size, we have no bitfield to
3526 // extract (since we also check that the inner shift is <= the outer shift
3527 // then this also implies that the inner shift is < the type size)
3528 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3529 return false;
3530 }
3531
3532 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3533 MVT::i32);
3534 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3535 DL, MVT::i32);
3536
3537 if (N->getOpcode() == ISD::SRA) {
3538 // If we have a arithmetic right shift, we need to use the signed bfe
3539 // variant
3540 IsSigned = true;
3541 }
3542 } else {
3543 // No can do...
3544 return false;
3545 }
3546 } else {
3547 // No can do...
3548 return false;
3549 }
3550
3551
3552 unsigned Opc;
3553 // For the BFE operations we form here from "and" and "srl", always use the
3554 // unsigned variants.
3555 if (Val.getValueType() == MVT::i32) {
3556 if (IsSigned) {
3557 Opc = NVPTX::BFE_S32rii;
3558 } else {
3559 Opc = NVPTX::BFE_U32rii;
3560 }
3561 } else if (Val.getValueType() == MVT::i64) {
3562 if (IsSigned) {
3563 Opc = NVPTX::BFE_S64rii;
3564 } else {
3565 Opc = NVPTX::BFE_U64rii;
3566 }
3567 } else {
3568 // We cannot handle this type
3569 return false;
3570 }
3571
3572 SDValue Ops[] = {
3573 Val, Start, Len
3574 };
3575
3576 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3577 return true;
3578 }
3579
3580 // SelectDirectAddr - Match a direct address for DAG.
3581 // A direct address could be a globaladdress or externalsymbol.
SelectDirectAddr(SDValue N,SDValue & Address)3582 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3583 // Return true if TGA or ES.
3584 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3585 N.getOpcode() == ISD::TargetExternalSymbol) {
3586 Address = N;
3587 return true;
3588 }
3589 if (N.getOpcode() == NVPTXISD::Wrapper) {
3590 Address = N.getOperand(0);
3591 return true;
3592 }
3593 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3595 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3596 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3597 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3598 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3599 }
3600 return false;
3601 }
3602
3603 // symbol+offset
SelectADDRsi_imp(SDNode * OpNode,SDValue Addr,SDValue & Base,SDValue & Offset,MVT mvt)3604 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3606 if (Addr.getOpcode() == ISD::ADD) {
3607 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3608 SDValue base = Addr.getOperand(0);
3609 if (SelectDirectAddr(base, Base)) {
3610 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3611 mvt);
3612 return true;
3613 }
3614 }
3615 }
3616 return false;
3617 }
3618
3619 // symbol+offset
// Match a symbol+offset address with a 32-bit offset operand.
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3624
3625 // symbol+offset
// Match a symbol+offset address with a 64-bit offset operand.
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3630
// register+offset
// Match Addr as a (register-or-frame-index + constant) address pair for
// load/store selection. On success Base holds the register/frame-index
// operand and Offset holds the immediate, both of type \p mvt.
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  // A bare frame index selects as FI + 0.
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    // A symbolic base means this ADD is the symbol+offset form (handled by
    // SelectADDRsi), not register+offset — reject it here.
    // NOTE(review): Addr is reused as SelectDirectAddr's output parameter,
    // so it may be overwritten before we return false. Addr is a by-value
    // copy, so callers are unaffected — confirm this scratch use is intended.
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}
3661
3662 // register+offset
SelectADDRri(SDNode * OpNode,SDValue Addr,SDValue & Base,SDValue & Offset)3663 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3664 SDValue &Base, SDValue &Offset) {
3665 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3666 }
3667
3668 // register+offset
SelectADDRri64(SDNode * OpNode,SDValue Addr,SDValue & Base,SDValue & Offset)3669 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3670 SDValue &Base, SDValue &Offset) {
3671 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3672 }
3673
ChkMemSDNodeAddressSpace(SDNode * N,unsigned int spN) const3674 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3675 unsigned int spN) const {
3676 const Value *Src = nullptr;
3677 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3678 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3679 return true;
3680 Src = mN->getMemOperand()->getValue();
3681 }
3682 if (!Src)
3683 return false;
3684 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3685 return (PT->getAddressSpace() == spN);
3686 return false;
3687 }
3688
3689 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690 /// inline asm expressions.
SelectInlineAsmMemoryOperand(const SDValue & Op,unsigned ConstraintID,std::vector<SDValue> & OutOps)3691 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3692 const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3693 SDValue Op0, Op1;
3694 switch (ConstraintID) {
3695 default:
3696 return true;
3697 case InlineAsm::Constraint_m: // memory
3698 if (SelectDirectAddr(Op, Op0)) {
3699 OutOps.push_back(Op0);
3700 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3701 return false;
3702 }
3703 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3704 OutOps.push_back(Op0);
3705 OutOps.push_back(Op1);
3706 return false;
3707 }
3708 break;
3709 }
3710 return true;
3711 }
3712
3713 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3714 /// conversion from \p SrcTy to \p DestTy.
GetConvertOpcode(MVT DestTy,MVT SrcTy,bool IsSigned)3715 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3716 bool IsSigned) {
3717 switch (SrcTy.SimpleTy) {
3718 default:
3719 llvm_unreachable("Unhandled source type");
3720 case MVT::i8:
3721 switch (DestTy.SimpleTy) {
3722 default:
3723 llvm_unreachable("Unhandled dest type");
3724 case MVT::i16:
3725 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3726 case MVT::i32:
3727 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3728 case MVT::i64:
3729 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3730 }
3731 case MVT::i16:
3732 switch (DestTy.SimpleTy) {
3733 default:
3734 llvm_unreachable("Unhandled dest type");
3735 case MVT::i8:
3736 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3737 case MVT::i32:
3738 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3739 case MVT::i64:
3740 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3741 }
3742 case MVT::i32:
3743 switch (DestTy.SimpleTy) {
3744 default:
3745 llvm_unreachable("Unhandled dest type");
3746 case MVT::i8:
3747 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3748 case MVT::i16:
3749 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3750 case MVT::i64:
3751 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3752 }
3753 case MVT::i64:
3754 switch (DestTy.SimpleTy) {
3755 default:
3756 llvm_unreachable("Unhandled dest type");
3757 case MVT::i8:
3758 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3759 case MVT::i16:
3760 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3761 case MVT::i32:
3762 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3763 }
3764 }
3765 }
3766