1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
// Immediate operands for the two 16-bit floating-point value types.  Both
// records set OperandType to "OPERAND_IMMEDIATE".
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
def bf16imm : Operand<bf16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
20
// Vector-specific instruction properties.  Each record wraps the integer tag
// handed to VecInstTypeEnum; tags 1-7 are consecutive and isVecOther uses 15.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
30
31//===----------------------------------------------------------------------===//
32// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===//
34
// Operand of value type OtherVT.  NOTE(review): by its name this is the
// operand used for branch targets -- confirm against the branch instructions.
def brtarget : Operand<OtherVT>;
36
// CVT conversion modes.
// These must match the enum in NVPTX.h.
//
// Value layout as used by the leaves below:
//   0x0 - 0x9 : base mode
//   + 0x10    : FTZ variant of a base mode
//   0x20      : SAT (0x30 is SAT combined with the 0x10 FTZ value)
//   + 0x40    : RELU variant of a base mode

// Base modes.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
def CvtRNA : PatLeaf<(i32 0x9)>;

// FTZ variants (there is no FTZ form of CvtRNA in this list).
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating modes.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// RELU variants; only NONE, RN and RZ are provided.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
66
// i32 operand holding a conversion mode; printed through "printCvtMode".
def CvtMode : Operand<i32> { let PrintMethod = "printCvtMode"; }
70
// Compare modes.
// These must match the enum in NVPTX.h.
//
// Base modes take the values 0-17.  An FTZ variant is the base value plus
// 0x100; no FTZ variants are listed for LO/LS/HI/HS (values 6-9).

// Base modes.
def CmpEQ : PatLeaf<(i32 0)>;
def CmpNE : PatLeaf<(i32 1)>;
def CmpLT : PatLeaf<(i32 2)>;
def CmpLE : PatLeaf<(i32 3)>;
def CmpGT : PatLeaf<(i32 4)>;
def CmpGE : PatLeaf<(i32 5)>;
def CmpLO : PatLeaf<(i32 6)>;
def CmpLS : PatLeaf<(i32 7)>;
def CmpHI : PatLeaf<(i32 8)>;
def CmpHS : PatLeaf<(i32 9)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// FTZ variants.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
106
// i32 operand holding a compare mode; printed through "printCmpMode".
def CmpMode : Operand<i32> { let PrintMethod = "printCmpMode"; }
// i32 operand printed through "printVecElement".
def VecElement : Operand<i32> { let PrintMethod = "printVecElement"; }
113
// PRMT modes, numbered consecutively from 0.
// These must match the enum in NVPTX.h.
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E : PatLeaf<(i32 0x1)>;
def PrmtB4E : PatLeaf<(i32 0x2)>;
def PrmtRC8 : PatLeaf<(i32 0x3)>;
def PrmtECL : PatLeaf<(i32 0x4)>;
def PrmtECR : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;
123
// i32 operand holding a PRMT mode; printed through "printPrmtMode".
def PrmtMode : Operand<i32> { let PrintMethod = "printPrmtMode"; }
127
128
129//===----------------------------------------------------------------------===//
130// NVPTX Instruction Predicate Definitions
131//===----------------------------------------------------------------------===//
132
133
// Predicates that forward to a Subtarget query of the same name.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero on / off.
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

// Evaluates the expression `doMulWide` (a plain name, not a call).
def doMulWide : Predicate<"doMulWide">;

// FMA and unsafe-fp-math switches, each with its negation.
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath    : Predicate<"!allowUnsafeFPMath()">;

// f32 division flavour, keyed on getDivF32Level(): 0 -> APPROX, 1 -> FULL.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

// f32 square root flavour, keyed on usePrecSqrtF32().
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

// 32-bit hardware rotate present / absent.
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate whose condition is the constant `true`.
def True : Predicate<"true">;
163
// Parameterized version checks.  `version` is pasted into the condition text,
// e.g. hasPTX<64> tests "getPTXVersion() >= 64" and hasSM<70> tests
// "getSmVersion() >= 70".
class hasPTX<int version>
    : Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>
    : Predicate<"Subtarget->getSmVersion() >= " # version>;
166
// Non-sync shfl instructions are not available on sm_70+ in PTX6.4+, so this
// predicate is the negation of (SM >= 70 and PTX >= 64).  The condition text
// is kept character-for-character (including the missing space before "&&").
def hasSHFL
    : Predicate<"!(Subtarget->getSmVersion() >= 70&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math : Predicate<"Subtarget->hasBF16Math()">;
174
// Helper class to aid conversion between ValueType and a matching
// RegisterClass.
//
// The lookup is keyed on the record name of T (!cast<string>(T)); the chosen
// register class is exposed in the `ret` field.  The !cond has no catch-all
// arm, so a name that is not listed selects nothing (TableGen reports an
// error for a !cond in which no condition is true).
class ValueToRegClass<ValueType T> {
   string name = !cast<string>(T);
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
     !eq(name, "v2i16"): Int32Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     // 16-bit floating-point scalars share Int16Regs; their 2-element
     // vectors share Int32Regs.
     !eq(name, "f16"): Int16Regs,
     !eq(name, "v2f16"): Int32Regs,
     !eq(name, "bf16"): Int16Regs,
     !eq(name, "v2bf16"): Int32Regs,
     !eq(name, "f32"): Float32Regs,
     !eq(name, "f64"): Float64Regs,
     // "a"-prefixed keys select the *ArgRegs classes.
     !eq(name, "ai32"): Int32ArgRegs,
     !eq(name, "ai64"): Int64ArgRegs,
     !eq(name, "af32"): Float32ArgRegs,
     // "af64" follows the ai32/ai64/af32 naming of its siblings.  The
     // previous spelling "if64" is retained as an alias so that anything
     // that relied on it keeps resolving to the same class.
     !eq(name, "af64"): Float64ArgRegs,
     !eq(name, "if64"): Float64ArgRegs,
    );
}
197
198
199//===----------------------------------------------------------------------===//
200// Some Common Instruction Class Templates
201//===----------------------------------------------------------------------===//
202
// Two-source integer template covering 64-, 32- and 16-bit registers.  For
// each width it produces a register/register ("rr") and a register/immediate
// ("ri") record.  The printed mnemonic is "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, i64imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms; the register operands are typed i32 explicitly.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 16-bit forms.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, i16imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}
231
// Packed v2i16 operation held in 32-bit registers, printed as
// "<OpcStr>16x2".  Gated on PTX version >= 80 and SM version >= 90.
class I16x2<string OpcStr, SDNode OpNode>
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
    Requires<[hasPTX<80>, hasSM<90>]>;
237
// Template for the integer add/sub-with-carry family.  It yields 32-bit
// records printed as "<OpcStr>.s32" (e.g. "addc.cc.s32") and 64-bit records
// printed as "<OpcStr>.s64"; the 64-bit ones additionally require PTX
// version >= 43.  Every record is marked hasSideEffects = 1.
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  let hasSideEffects = 1 in {
    // 32-bit forms.
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

    // 64-bit forms.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}
262
// Two-source floating-point template.  Records are printed as
// "<OpcStr>.<type>" (e.g. "min.f64") and cover f64, f32, f16, f16x2, bf16 and
// bf16x2.  f64 and f32 come in register/register ("rr") and
// register/immediate ("ri") forms; the 16-bit types only have "rr" forms.
//
// Every type except f64 also gets an ftz variant (flush subnormal inputs and
// results to sign-preserving zero), gated on doF32FTZ.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
//
// NOTE(review): each ftz record is declared ahead of its plain twin, which
// matches the same pattern; keep that order unless selection priority has
// been checked.
multiclass F3<string OpcStr, SDNode OpNode> {
  // ---- f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32 ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- f16x2 (v2f16 held in Int32Regs) ----
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math]>;

  // ---- bf16x2 (v2bf16 held in Int32Regs) ----
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math]>;
}
357
// Two-source floating-point template for nodes that can be folded to make
// fma ops.  Records are printed as "<OpcStr>.<type>" (e.g. "add.f64") and
// cover f64, f32, f16, f16x2, bf16 and bf16x2, with ftz (flush subnormal
// inputs and results to sign-preserving zero) variants for everything but
// f64.
//
// Each record exists twice:
//   * the plain form, gated on allowFMA;
//   * a "_rn" form, gated on noFMA, that prints the ".rn" variant.  When FMA
//     is disabled we use ".rn" because it behaves just like the non-".rn" op
//     but prevents ptxas from creating FMAs.
//
// NOTE(review): each ftz record is declared ahead of its plain twin, which
// matches the same pattern; keep that order unless selection priority has
// been checked.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- Forms used when FMA contraction is allowed ---===//

  // ---- f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- f32 ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ---- f16x2 (v2f16 held in Int32Regs) ----
  // The ftz record types its destination as v2f16 explicitly.
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set (v2f16 Int32Regs:$dst),
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  // ---- bf16x2 (v2bf16 held in Int32Regs) ----
  // The ftz record types its destination as v2bf16 explicitly.
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set (v2bf16 Int32Regs:$dst),
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  //===--- ".rn" forms used when FMA contraction is disabled ---===//
  // These have strange names so we don't perturb existing mir tests.

  // ---- f64 ----
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- f32 ----
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- f16 ----
  def _rnf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // ---- f16x2 ----
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // ---- bf16 ----
  def _rnbf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;

  // ---- bf16x2 ----
  def _rnbf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
}
541
// Single-source floating-point template (one destination, one source).
// Provides three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero; gated on doF32FTZ) and <OpcStr>.f32.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
557
// Single-source template for the 16-bit floating-point types.
//   * bf16 / bf16x2 require SM version >= 80 and PTX version >= 70.
//   * f16 / f16x2 require SM version >= 53 and PTX version >= 65; their ftz
//     variants additionally require doF32FTZ.
// Scalars live in Int16Regs, the x2 vectors in Int32Regs.
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  // ---- bf16 ----
  def bf16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".bf16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".bf16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;

  // ---- f16, ftz ----
  def f16_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".ftz.f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".ftz.f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;

  // ---- f16 ----
  def f16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
}
585
586//===----------------------------------------------------------------------===//
587// NVPTX Instructions.
588//===----------------------------------------------------------------------===//
589
590//-----------------------------------
591// Type Conversion
592//-----------------------------------
593
594let hasSideEffects = false in {
595  // Generate a cvt to the given type from all possible types.  Each instance
596  // takes a CvtMode immediate that defines the conversion mode to use.  It can
597  // be CvtNONE to omit a conversion mode.
598  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
599    def _s8 :
600      NVPTXInst<(outs RC:$dst),
601                (ins Int16Regs:$src, CvtMode:$mode),
602                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
603                ToType, ".s8 \t$dst, $src;"), []>,
604      Requires<Preds>;
605    def _u8 :
606      NVPTXInst<(outs RC:$dst),
607                (ins Int16Regs:$src, CvtMode:$mode),
608                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
609                ToType, ".u8 \t$dst, $src;"), []>,
610      Requires<Preds>;
611    def _s16 :
612      NVPTXInst<(outs RC:$dst),
613                (ins Int16Regs:$src, CvtMode:$mode),
614                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
615                ToType, ".s16 \t$dst, $src;"), []>,
616      Requires<Preds>;
617    def _u16 :
618      NVPTXInst<(outs RC:$dst),
619                (ins Int16Regs:$src, CvtMode:$mode),
620                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
621                ToType, ".u16 \t$dst, $src;"), []>,
622      Requires<Preds>;
623    def _s32 :
624      NVPTXInst<(outs RC:$dst),
625                (ins Int32Regs:$src, CvtMode:$mode),
626                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
627                ToType, ".s32 \t$dst, $src;"), []>,
628      Requires<Preds>;
629    def _u32 :
630      NVPTXInst<(outs RC:$dst),
631                (ins Int32Regs:$src, CvtMode:$mode),
632                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
633                ToType, ".u32 \t$dst, $src;"), []>,
634      Requires<Preds>;
635    def _s64 :
636      NVPTXInst<(outs RC:$dst),
637                (ins Int64Regs:$src, CvtMode:$mode),
638                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
639                ToType, ".s64 \t$dst, $src;"), []>,
640      Requires<Preds>;
641    def _u64 :
642      NVPTXInst<(outs RC:$dst),
643                (ins Int64Regs:$src, CvtMode:$mode),
644                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
645                ToType, ".u64 \t$dst, $src;"), []>,
646      Requires<Preds>;
647    def _f16 :
648      NVPTXInst<(outs RC:$dst),
649                (ins Int16Regs:$src, CvtMode:$mode),
650                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
651                ToType, ".f16 \t$dst, $src;"), []>,
652      Requires<Preds>;
653    def _bf16 :
654      NVPTXInst<(outs RC:$dst),
655                (ins Int16Regs:$src, CvtMode:$mode),
656                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
657                ToType, ".bf16 \t$dst, $src;"), []>,
658      Requires<!if(!eq(ToType, "f32"),
659                   // bf16->f32 was introduced early.
660                   [hasPTX<71>, hasSM<80>],
661                   // bf16->everything else needs sm90/ptx78
662                   [hasPTX<78>, hasSM<90>])>;
663    def _f32 :
664      NVPTXInst<(outs RC:$dst),
665                (ins Float32Regs:$src, CvtMode:$mode),
666                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
667                ToType, ".f32 \t$dst, $src;"), []>,
668      Requires<!if(!eq(ToType, "bf16"),
669                   // f32->bf16 was introduced early.
670                   [hasPTX<70>, hasSM<80>],
671                   Preds)>;
672    def _f64 :
673      NVPTXInst<(outs RC:$dst),
674                (ins Float64Regs:$src, CvtMode:$mode),
675                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
676                ToType, ".f64 \t$dst, $src;"), []>,
677      Requires<Preds>;
678  }
679
  // Generate cvts from all types to all types.
  // Each line instantiates CVT_FROM_ALL for one destination type: the first
  // argument is the PTX destination type string and the second is the register
  // class that holds the result. Note that s8/u8/s16/u16/f16/bf16 results all
  // live in Int16Regs.
  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
  // bf16 is the only destination that passes an explicit predicate list; the
  // other instantiations rely on the multiclass default for that argument.
  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
693
  // These cvts are different from those above: The source and dest registers
  // are of the same type.
  // The record name suffix _<dst>_<src> mirrors the type pair printed in the
  // asm string. None of these carries an ISel pattern (empty pattern list) and
  // none takes a conversion-mode operand.
  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                                    "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                                    "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                                    "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s32 \t$dst, $src;", []>;
708
709  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
710    def _f32 :
711      NVPTXInst<(outs RC:$dst),
712                (ins Float32Regs:$src1, Float32Regs:$src2,  CvtMode:$mode),
713                !strconcat("cvt${mode:base}${mode:relu}.",
714                FromName, ".f32 \t$dst, $src1, $src2;"), []>,
715    Requires<[hasPTX<70>, hasSM<80>]>;
716  }
717
  // Packed two-element f16 / bf16 results are produced in a 32-bit integer
  // register. Each instantiation yields one record with the _f32 suffix.
  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
720}
721
722//-----------------------------------
723// Selection instructions (selp)
724//-----------------------------------
725
726// TODO: Missing slct
727
728// selp instructions that don't have any pattern matches; we explicitly use
729// them within this file.
730let hasSideEffects = false in {
731  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
732    def rr : NVPTXInst<(outs RC:$dst),
733                       (ins RC:$a, RC:$b, Int1Regs:$p),
734                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
735    def ri : NVPTXInst<(outs RC:$dst),
736                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
737                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
738    def ir : NVPTXInst<(outs RC:$dst),
739                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
740                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
741    def ii : NVPTXInst<(outs RC:$dst),
742                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
743                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
744  }
745
746  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
747                          Operand ImmCls, SDNode ImmNode> {
748    def rr :
749      NVPTXInst<(outs RC:$dst),
750                (ins RC:$a, RC:$b, Int1Regs:$p),
751                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
752                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
753    def ri :
754      NVPTXInst<(outs RC:$dst),
755                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
756                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
757                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
758    def ir :
759      NVPTXInst<(outs RC:$dst),
760                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
761                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
762                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
763    def ii :
764      NVPTXInst<(outs RC:$dst),
765                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
766                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
767                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
768  }
769}
770
// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.
// The s/u variants are therefore instantiated from the pattern-less SELP
// multiclass; only the b/f variants come from SELP_PATTERN.
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16 and bf16 values live in Int16Regs, so their selects print as selp.b16
// while matching the f16/bf16 value types with fpimm immediates.
defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
787
788// This does not work as tablegen fails to infer the type of 'imm'.
789// def v2f16imm : Operand<v2f16>;
790// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
791
// Selects on the packed vector types held in Int32Regs are mapped onto the
// register/register selp.b32 form; only register operands are covered.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
}
796
797//-----------------------------------
798// Test Instructions
799//-----------------------------------
800
// testp.infinite for f32/f64: writes predicate $p computed from operand $a.
// Suffix r takes a register operand, suffix i an immediate operand. None of
// these has an ISel pattern; they are referenced by name from other patterns.
def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
813
814//-----------------------------------
815// Integer Arithmetic
816//-----------------------------------
817
// Template for xor masquerading as int1 arithmetic.
// OpNode is the generic node being matched; both forms print xor.pred
// regardless of which node is supplied.
multiclass ADD_SUB_i1<SDNode OpNode> {
   // Predicate register op predicate register.
   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
   // Predicate register op i1 immediate.
   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}
827
// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// Same mnemonics and nodes through the I16x2 class (defined outside this
// excerpt; presumably the packed v2i16 form -- confirm against its definition).
def ADD16x2 : I16x2<"add.s", add>;
def SUB16x2 : I16x2<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// mul.lo is used for the generic mul node.
defm MULT : I3<"mul.lo.s", mul>;

// mul.hi in signed and unsigned flavours for mulhs / mulhu.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
860
// Integer absolute value, selected directly from the generic `abs` node.
// T is the value type matched on RC, and SizeName is the PTX type suffix
// (including the leading '.') appended to the "abs" mnemonic.
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set (T RC:$dst), (abs (T RC:$a)))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
872
// Integer min/max.
// Signed and unsigned variants map to max.s/max.u and min.s/min.u.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

// Same mnemonics and nodes through the I16x2 class (defined outside this
// excerpt).
def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;
883
884
885//
886// Wide multiplication
887//
888def MULWIDES64 :
889  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
890            "mul.wide.s32 \t$dst, $a, $b;", []>;
891def MULWIDES64Imm :
892  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
893            "mul.wide.s32 \t$dst, $a, $b;", []>;
894def MULWIDES64Imm64 :
895  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
896            "mul.wide.s32 \t$dst, $a, $b;", []>;
897
898def MULWIDEU64 :
899  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
900            "mul.wide.u32 \t$dst, $a, $b;", []>;
901def MULWIDEU64Imm :
902  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
903            "mul.wide.u32 \t$dst, $a, $b;", []>;
904def MULWIDEU64Imm64 :
905  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
906            "mul.wide.u32 \t$dst, $a, $b;", []>;
907
908def MULWIDES32 :
909  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
910            "mul.wide.s16 \t$dst, $a, $b;", []>;
911def MULWIDES32Imm :
912  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
913            "mul.wide.s16 \t$dst, $a, $b;", []>;
914def MULWIDES32Imm32 :
915  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
916            "mul.wide.s16 \t$dst, $a, $b;", []>;
917
918def MULWIDEU32 :
919  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
920            "mul.wide.u16 \t$dst, $a, $b;", []>;
921def MULWIDEU32Imm :
922  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
923            "mul.wide.u16 \t$dst, $a, $b;", []>;
924def MULWIDEU32Imm32 :
925  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
926            "mul.wide.u16 \t$dst, $a, $b;", []>;
927
// Type profile for the target mul.wide nodes: one result and two operands,
// with the only constraint being that both operands share a type. The result
// type is left unconstrained here and is fixed by the patterns that use it.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
931
// Matchers for signed, unsigned mul.wide ISD nodes.
// 16-bit operands producing a 32-bit result. Each node has a register/register
// form and a register/immediate form; all are gated on doMulWide. Result
// operands are written with the register class, matching the sibling patterns.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
945
// 32-bit operands producing a 64-bit result: register/register and
// register/immediate forms for the signed and unsigned nodes, all gated on
// doMulWide.
def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
958
959// Predicates used for converting some patterns to mul.wide.
// Integer immediate whose value fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;
964
// Integer immediate whose value fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;
969
// Integer immediate whose value fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;
974
// Integer immediate whose value fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;
979
// Shift-amount immediate in the closed range [0, 30].
def IntConst_0_30 : PatLeaf<(imm), [{
  // Check if 0 <= ShAmt < 31; only then will the result of (x << ShAmt) be an
  // int32.
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(31);
}]>;
985
// Shift-amount immediate in the closed range [0, 14].
def IntConst_0_14 : PatLeaf<(imm), [{
  // Check if 0 <= ShAmt < 15; only then will the result of (x << ShAmt) be an
  // int16.
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(15);
}]>;
991
// Transforms a shift-amount immediate c into the i32 target constant (1 << c),
// i.e. the multiplier equivalent to a left shift by c.
def SHL2MUL32 : SDNodeXForm<imm, [{
  const APInt One(32, 1);
  const APInt Multiplier = One.shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i32);
}]>;
997
// Transforms a shift-amount immediate c into the i16 target constant (1 << c),
// i.e. the multiplier equivalent to a left shift by c.
def SHL2MUL16 : SDNodeXForm<imm, [{
  const APInt One(16, 1);
  const APInt Multiplier = One.shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i16);
}]>;
1003
// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
// The shift amount is restricted by IntConst_0_30 / IntConst_0_14 and is
// rewritten to the multiplier (1 << amount) by SHL2MUL32 / SHL2MUL16, so the
// immediate forms of mul.wide can be used. All patterns require doMulWide.

// 32-bit source extended to 64 bits.
def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;

// 16-bit source extended to 32 bits.
def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
1018
// Convert "sign/zero-extend then multiply" to mul.wide.
// For each width and signedness there is a form where both operands are
// extended registers, and a form where the second operand is a wide immediate
// that the matching *Const leaf has checked to fit in the narrow source type.
// All patterns require doMulWide.

// 32 x 32 -> 64, signed.
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
      Requires<[doMulWide]>;

// 32 x 32 -> 64, unsigned.
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
      Requires<[doMulWide]>;

// 16 x 16 -> 32, signed.
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
      Requires<[doMulWide]>;

// 16 x 16 -> 32, unsigned.
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
      Requires<[doMulWide]>;
1047
1048//
1049// Integer multiply-add
1050//
1051def SDTIMAD :
1052  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
1053                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
1054def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
1055
// 16-bit mad.lo matched from the imad node. The three suffix letters give the
// kinds of $a, $b and $c (r = register, i = immediate); $a is always a
// register.
def MAD16rrr :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
1076
// 32-bit mad.lo matched from the imad node. Suffix letters give the kinds of
// $a, $b and $c (r = register, i = immediate). Register operands carry an
// explicit i32 type annotation in the patterns.
def MAD32rrr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
def MAD32rri :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
def MAD32rir :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
def MAD32rii :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;
1097
// 64-bit mad.lo matched from the imad node. Suffix letters give the kinds of
// $a, $b and $c (r = register, i = immediate); $a is always a register.
def MAD64rrr :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
1118
// Integer negation for 16/32/64-bit registers, matched from the ineg fragment.
// Only the 32-bit form annotates its operands with an explicit value type.
def INEG16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "neg.s16 \t$dst, $src;",
            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "neg.s32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
def INEG64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "neg.s64 \t$dst, $src;",
            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1131
1132//-----------------------------------
1133// Floating Point Arithmetic
1134//-----------------------------------
1135
// Constant 1.0f: an FP immediate with IEEE single semantics equal to 1.0f.
def FloatConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double): an FP immediate with IEEE double semantics equal to
// 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;
1146
// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
// Neither def has an ISel pattern.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
// Same mov.b16 form, taking a bf16 immediate operand.
def LOAD_CONST_BF16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
// Basic FP arithmetic. F3_fma_component, F3, F2 and F2_Support_Half are
// defined outside this excerpt; each takes a mnemonic stem and the generic
// node to match.
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

// fminnum/fmaxnum use plain min/max.
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
defm FNEG_H: F2_Support_Half<"neg", fneg>;

defm FSQRT : F2<"sqrt.rn", fsqrt>;
1174
1175//
1176// F16 NEG
1177//
// fneg on an f16 or packed f16x2 value held in RC. OpcStr is the full mnemonic
// and T the value type matched. Always requires useFP16Math, PTX 6.0 and
// sm_53, plus the caller-supplied Pred.
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]>,
      Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
// Scalar f16 lives in Int16Regs, packed v2f16 in Int32Regs. The .ftz variants
// are gated on doF32FTZ and the plain variants on True.
def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
1187
1188//
1189// BF16 NEG
1190//
1191
// fneg on a bf16 or packed bf16x2 value held in RC. OpcStr is the full
// mnemonic and T the value type matched. Always requires hasBF16Math, PTX 7.0
// and sm_80, plus the caller-supplied Pred.
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]>,
      Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
// Scalar bf16 lives in Int16Regs, packed v2bf16 in Int32Regs. The .ftz
// variants are gated on doF32FTZ and the plain variants on True.
def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1201
1202//
1203// F64 division
1204//
1205def FDIV641r :
1206  NVPTXInst<(outs Float64Regs:$dst),
1207            (ins f64imm:$a, Float64Regs:$b),
1208            "rcp.rn.f64 \t$dst, $b;",
1209            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
1210def FDIV64rr :
1211  NVPTXInst<(outs Float64Regs:$dst),
1212            (ins Float64Regs:$a, Float64Regs:$b),
1213            "div.rn.f64 \t$dst, $a, $b;",
1214            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
1215def FDIV64ri :
1216  NVPTXInst<(outs Float64Regs:$dst),
1217            (ins Float64Regs:$a, f64imm:$b),
1218            "div.rn.f64 \t$dst, $a, $b;",
1219            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
1220
1221//
1222// F32 Approximate reciprocal
1223//
1224def FDIV321r_ftz :
1225  NVPTXInst<(outs Float32Regs:$dst),
1226            (ins f32imm:$a, Float32Regs:$b),
1227            "rcp.approx.ftz.f32 \t$dst, $b;",
1228            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1229            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1230def FDIV321r :
1231  NVPTXInst<(outs Float32Regs:$dst),
1232            (ins f32imm:$a, Float32Regs:$b),
1233            "rcp.approx.f32 \t$dst, $b;",
1234            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1235            Requires<[do_DIVF32_APPROX]>;
1236//
1237// F32 Approximate division
1238//
1239def FDIV32approxrr_ftz :
1240  NVPTXInst<(outs Float32Regs:$dst),
1241            (ins Float32Regs:$a, Float32Regs:$b),
1242            "div.approx.ftz.f32 \t$dst, $a, $b;",
1243            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1244            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1245def FDIV32approxri_ftz :
1246  NVPTXInst<(outs Float32Regs:$dst),
1247            (ins Float32Regs:$a, f32imm:$b),
1248            "div.approx.ftz.f32 \t$dst, $a, $b;",
1249            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1250            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1251def FDIV32approxrr :
1252  NVPTXInst<(outs Float32Regs:$dst),
1253            (ins Float32Regs:$a, Float32Regs:$b),
1254            "div.approx.f32 \t$dst, $a, $b;",
1255            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1256            Requires<[do_DIVF32_APPROX]>;
1257def FDIV32approxri :
1258  NVPTXInst<(outs Float32Regs:$dst),
1259            (ins Float32Regs:$a, f32imm:$b),
1260            "div.approx.f32 \t$dst, $a, $b;",
1261            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1262            Requires<[do_DIVF32_APPROX]>;
1263//
1264// F32 Semi-accurate reciprocal
1265//
1266// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
1267//
1268def FDIV321r_approx_ftz :
1269  NVPTXInst<(outs Float32Regs:$dst),
1270            (ins f32imm:$a, Float32Regs:$b),
1271            "rcp.approx.ftz.f32 \t$dst, $b;",
1272            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1273            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1274def FDIV321r_approx :
1275  NVPTXInst<(outs Float32Regs:$dst),
1276            (ins f32imm:$a, Float32Regs:$b),
1277            "rcp.approx.f32 \t$dst, $b;",
1278            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1279            Requires<[do_DIVF32_FULL]>;
1280//
1281// F32 Semi-accurate division
1282//
1283def FDIV32rr_ftz :
1284  NVPTXInst<(outs Float32Regs:$dst),
1285            (ins Float32Regs:$a, Float32Regs:$b),
1286            "div.full.ftz.f32 \t$dst, $a, $b;",
1287            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1288            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1289def FDIV32ri_ftz :
1290  NVPTXInst<(outs Float32Regs:$dst),
1291            (ins Float32Regs:$a, f32imm:$b),
1292            "div.full.ftz.f32 \t$dst, $a, $b;",
1293            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1294            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1295def FDIV32rr :
1296  NVPTXInst<(outs Float32Regs:$dst),
1297            (ins Float32Regs:$a, Float32Regs:$b),
1298            "div.full.f32 \t$dst, $a, $b;",
1299            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1300            Requires<[do_DIVF32_FULL]>;
1301def FDIV32ri :
1302  NVPTXInst<(outs Float32Regs:$dst),
1303            (ins Float32Regs:$a, f32imm:$b),
1304            "div.full.f32 \t$dst, $a, $b;",
1305            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1306            Requires<[do_DIVF32_FULL]>;
1307//
1308// F32 Accurate reciprocal
1309//
1310def FDIV321r_prec_ftz :
1311  NVPTXInst<(outs Float32Regs:$dst),
1312            (ins f32imm:$a, Float32Regs:$b),
1313            "rcp.rn.ftz.f32 \t$dst, $b;",
1314            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1315            Requires<[doF32FTZ]>;
1316def FDIV321r_prec :
1317  NVPTXInst<(outs Float32Regs:$dst),
1318            (ins f32imm:$a, Float32Regs:$b),
1319            "rcp.rn.f32 \t$dst, $b;",
1320            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1321//
1322// F32 Accurate division
1323//
1324def FDIV32rr_prec_ftz :
1325  NVPTXInst<(outs Float32Regs:$dst),
1326            (ins Float32Regs:$a, Float32Regs:$b),
1327            "div.rn.ftz.f32 \t$dst, $a, $b;",
1328            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1329            Requires<[doF32FTZ]>;
1330def FDIV32ri_prec_ftz :
1331  NVPTXInst<(outs Float32Regs:$dst),
1332            (ins Float32Regs:$a, f32imm:$b),
1333            "div.rn.ftz.f32 \t$dst, $a, $b;",
1334            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1335            Requires<[doF32FTZ]>;
1336def FDIV32rr_prec :
1337  NVPTXInst<(outs Float32Regs:$dst),
1338            (ins Float32Regs:$a, Float32Regs:$b),
1339            "div.rn.f32 \t$dst, $a, $b;",
1340            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
1341def FDIV32ri_prec :
1342  NVPTXInst<(outs Float32Regs:$dst),
1343            (ins Float32Regs:$a, f32imm:$b),
1344            "div.rn.f32 \t$dst, $a, $b;",
1345            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
1346
1347//
1348// FMA
1349//
1350
// Fused multiply-add matched from the generic fma node. OpcStr is the full
// mnemonic, RC the register class, ImmCls the FP immediate operand class and
// Pred the single predicate applied to every form. Suffix letters give the
// kinds of $a, $b and $c (r = register, i = immediate); $a is always a
// register.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
            Requires<[Pred]>;
  def rri : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
            Requires<[Pred]>;
  def rir : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
            Requires<[Pred]>;
  def rii : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
            Requires<[Pred]>;
}
1372
// fma for f16 / f16x2 values: register-only form, with the value type T given
// explicitly for each operand. Requires useFP16Math in addition to Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
            Requires<[useFP16Math, Pred]>;
}
1379
// fma for bf16 / bf16x2 values: register-only form, with the value type T
// given explicitly for each operand. Requires hasBF16Math in addition to Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
            Requires<[hasBF16Math, Pred]>;
}
1386
// fma.rn instantiations. The _ftz variants are gated on doF32FTZ and the plain
// variants on True. Half-precision scalars use Int16Regs and the packed x2
// forms use Int32Regs. There is no ftz variant for f64.
defm FMA16_ftz    : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16        : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz  : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2      : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16_ftz   : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16       : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2     : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz    : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32        : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64        : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
1398
// Approximate f32 sine/cosine.  Both are selected for fsin/fcos only when
// allowUnsafeFPMath holds.
def SINF :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$src),
            "sin.approx.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
  Requires<[allowUnsafeFPMath]>;
def COSF :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$src),
            "cos.approx.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
  Requires<[allowUnsafeFPMath]>;
1408
1409// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1410// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1411// semantics of LLVM's frem.
1412
// frem - f32, flush-to-zero variants.
//
// Unsafe-math forms: x - trunc(x / y) * y, with no special-casing.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz
            Float32Regs:$x,
            (FMULf32rr_ftz
              (CVT_f32_f32 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                           CvtRZI_FTZ),
              Float32Regs:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz
            Float32Regs:$x,
            (FMULf32ri_ftz
              (CVT_f32_f32 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                           CvtRZI_FTZ),
              fpimm:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;

// Safe forms: same computation, but SELP picks $x when TESTINF reports that
// $y is infinite.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32rr_ftz
                (CVT_f32_f32 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                             CvtRZI_FTZ),
                Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32ri_ftz
                (CVT_f32_f32 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                             CvtRZI_FTZ),
                fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
1439
// frem - f32, non-FTZ variants.
//
// Unsafe-math forms: x - trunc(x / y) * y, with no special-casing.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32rr
              (CVT_f32_f32 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                           CvtRZI),
              Float32Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32ri
              (CVT_f32_f32 (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                           CvtRZI),
              fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

// Safe forms: SELP picks $x when TESTINF reports that $y is infinite.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32rr
                (CVT_f32_f32 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                             CvtRZI),
                Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32ri
                (CVT_f32_f32 (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                             CvtRZI),
                fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;
1466
// frem - f64.
//
// Unsafe-math forms: x - trunc(x / y) * y, with no special-casing.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64rr
              (CVT_f64_f64 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
              Float64Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64ri
              (CVT_f64_f64 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

// Safe register form: SELP picks $x when TESTINF reports that $y is infinite.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr
            Float64Regs:$x,
            (FSUBf64rr
              Float64Regs:$x,
              (FMULf64rr
                (CVT_f64_f64 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
                Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
// frem - f64, immediate divisor, safe (noUnsafeFPMath) form.
// SELP picks $x when the divisor is infinite.  $y is matched as an fpimm, so
// the infinity test must use the immediate form of TESTINF, mirroring the f32
// pattern that uses TESTINF_f32i; the register form was previously used here
// with an operand annotated as Float64Regs even though $y is an immediate.
// NOTE(review): relies on TESTINF_f64i being defined alongside TESTINF_f64r --
// confirm.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;
1493
1494//-----------------------------------
1495// Bitwise operations
1496//-----------------------------------
1497
// Template for two-source bitwise operations (dest, a, b).  For a given
// OpcStr/OpNode it creates .pred (i1), .b16, .b32 and .b64 instructions, each
// in a reg/reg ("rr") and a reg/imm ("ri") flavour.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // Predicate (i1) registers.
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

  // 16-bit.
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

  // 32-bit.  Register operands are explicitly typed as i32 in the patterns.
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 64-bit.
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
1534
// Instantiate the bitwise template for or / and / xor.
defm OR  : BITWISE<"or",  or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
1538
// Logical ops on the packed types v2i16 and v4i8 are selected as the plain
// b32 bitwise instructions.
foreach vt = [v2i16, v4i8] in {
  // Register/register forms.
  def : Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;

  // Register/immediate forms.  Constants of these vector types are legalized
  // into a bitcast from an i32 immediate, so that is the shape matched here.
  def : Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (XORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ANDb32ri Int32Regs:$a, imm:$b)>;
}
1557
// Bitwise NOT for predicate, 16-, 32- and 64-bit registers.
def NOT1 :
  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
            "not.pred \t$dst, $src;",
            [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "not.b16 \t$dst, $src;",
            [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "not.b32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
def NOT64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "not.b64 \t$dst, $src;",
            [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1570
// Template for left/right shifts with operands
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src are int64, int32 or int16; the shift amount is always int32.
// OpcStr is the mnemonic up to (not including) the bit width, e.g. "shl.b".
//
// A 32-bit (imm, imm) form is defined as well.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
   // 64-bit value.
   def i64rr :
     NVPTXInst<(outs Int64Regs:$dst),
               (ins Int64Regs:$a, Int32Regs:$b),
               OpcStr # "64 \t$dst, $a, $b;",
               [(set Int64Regs:$dst,
                     (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
   def i64ri :
     NVPTXInst<(outs Int64Regs:$dst),
               (ins Int64Regs:$a, i32imm:$b),
               OpcStr # "64 \t$dst, $a, $b;",
               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

   // 32-bit value.
   def i32rr :
     NVPTXInst<(outs Int32Regs:$dst),
               (ins Int32Regs:$a, Int32Regs:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst,
                     (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
   def i32ri :
     NVPTXInst<(outs Int32Regs:$dst),
               (ins Int32Regs:$a, i32imm:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst,
                     (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
   def i32ii :
     NVPTXInst<(outs Int32Regs:$dst),
               (ins i32imm:$a, i32imm:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

   // 16-bit value.
   def i16rr :
     NVPTXInst<(outs Int16Regs:$dst),
               (ins Int16Regs:$a, Int32Regs:$b),
               OpcStr # "16 \t$dst, $a, $b;",
               [(set Int16Regs:$dst,
                     (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
   def i16ri :
     NVPTXInst<(outs Int16Regs:$dst),
               (ins Int16Regs:$a, i32imm:$b),
               OpcStr # "16 \t$dst, $a, $b;",
               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
1606
// shl -> shl.bNN, sra -> shr.sNN, srl -> shr.uNN.
defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
1610
// Bit-reverse (bitreverse -> brev.b32 / brev.b64).
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a),
            "brev.b32 \t$dst, $a;",
            [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a),
            "brev.b64 \t$dst, $a;",
            [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1620
1621//
1622// Rotate: Use ptx shf instruction if available.
1623//
1624
// 32-bit rotate left using shf (requires hasHWROT32):
//        r2 = rotl r1, n
//    =>
//        r2 = shf.l.wrap r1, r1, n
def ROTL32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTL32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;
1639
// 32-bit rotate right using shf (requires hasHWROT32):
//        r2 = rotr r1, n
//    =>
//        r2 = shf.r.wrap r1, r1, n
def ROTR32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTR32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;
1654
// 32-bit software rotate by immediate, expanded as
//   (src << amt1) + (src >> amt2).
// The caller must supply amt2 == 32 - amt1.  No selection pattern is attached;
// the instruction is only produced through explicit Pat<> results.
def ROT32imm_sw :
  NVPTXInst<
    (outs Int32Regs:$dst),
    (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
    "{{\n\t"
    ".reg .b32 %lhs;\n\t"
    ".reg .b32 %rhs;\n\t"
    "shl.b32 \t%lhs, $src, $amt1;\n\t"
    "shr.b32 \t%rhs, $src, $amt2;\n\t"
    "add.u32 \t$dst, %lhs, %rhs;\n\t"
    "}}",
    []>;
1667
// Immediate transform: maps an immediate N to the i32 target constant 32 - N.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  uint64_t Amt = N->getZExtValue();
  return CurDAG->getTargetConstant(32 - Amt, SDLoc(N), MVT::i32);
}]>;
1671
// Without hardware rotate, 32-bit rotates by an immediate use ROT32imm_sw.
// rotl passes (amt, 32 - amt); rotr passes the two amounts swapped.
let Predicates = [noHWROT32] in {
  def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
  def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
}
1678
// 32-bit software rotate left by register (used when noHWROT32 holds),
// expanded as (src << n) + (src >> (32 - n)).
//
// The rotate amount of ISD::ROTL is interpreted modulo the bit width, while
// PTX shl/shr clamp amounts larger than the register width.  The amount is
// therefore reduced to n = amt & 31 first; without the mask any amt > 32
// would yield 0 instead of the rotated value.  For n == 0 the right shift by
// 32 clamps to 0, so the result is still $src.
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shl.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1693
// 32-bit software rotate right by register (used when noHWROT32 holds),
// expanded as (src >> n) + (src << (32 - n)).
//
// The rotate amount of ISD::ROTR is interpreted modulo the bit width, while
// PTX shl/shr clamp amounts larger than the register width.  The amount is
// therefore reduced to n = amt & 31 first; without the mask any amt > 32
// would yield 0 instead of the rotated value.  For n == 0 the left shift by
// 32 clamps to 0, so the result is still $src.
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shr.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1708
// 64-bit software rotate by immediate, expanded as
//   (src << amt1) + (src >> amt2).
// The caller must supply amt2 == 64 - amt1.  No selection pattern is attached;
// the instruction is only produced through explicit Pat<> results.
def ROT64imm_sw :
  NVPTXInst<
    (outs Int64Regs:$dst),
    (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
    "{{\n\t"
    ".reg .b64 %lhs;\n\t"
    ".reg .b64 %rhs;\n\t"
    "shl.b64 \t%lhs, $src, $amt1;\n\t"
    "shr.b64 \t%rhs, $src, $amt2;\n\t"
    "add.u64 \t$dst, %lhs, %rhs;\n\t"
    "}}",
    []>;
1721
// Immediate transform: maps an immediate N to the i32 target constant 64 - N.
def SUB_FRM_64 : SDNodeXForm<imm, [{
  uint64_t Amt = N->getZExtValue();
  return CurDAG->getTargetConstant(64 - Amt, SDLoc(N), MVT::i32);
}]>;
1725
// 64-bit rotates by an immediate use ROT64imm_sw.
// rotl passes (amt, 64 - amt); rotr passes the two amounts swapped.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src,
                       imm:$amt,
                       (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src,
                       (SUB_FRM_64 node:$amt),
                       imm:$amt)>;
1730
// 64-bit software rotate left by register, expanded as
//   (src << n) + (src >> (64 - n)).
//
// The rotate amount of ISD::ROTL is interpreted modulo the bit width, while
// PTX shl/shr clamp amounts larger than the register width.  The amount is
// therefore reduced to n = amt & 63 first; without the mask any amt > 64
// would yield 0 instead of the rotated value.  For n == 0 the right shift by
// 64 clamps to 0, so the result is still $src.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 63;\n\t"
            "shl.b64 \t%lhs, $src, %amt2;\n\t"
            "sub.u32 \t%amt2, 64, %amt2;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1744
// 64-bit software rotate right by register, expanded as
//   (src >> n) + (src << (64 - n)).
//
// The rotate amount of ISD::ROTR is interpreted modulo the bit width, while
// PTX shl/shr clamp amounts larger than the register width.  The amount is
// therefore reduced to n = amt & 63 first; without the mask any amt > 64
// would yield 0 instead of the rotated value.  For n == 0 the left shift by
// 64 clamps to 0, so the result is still $src.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 63;\n\t"
            "shr.b64 \t%lhs, $src, %amt2;\n\t"
            "sub.u32 \t%amt2, 64, %amt2;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1757
1758//
// Funnel shift in clamp mode
1760//
1761
// SDNodes for the clamp-mode funnel shifts, so that DAG code such as
// NVPTXISelLowering (LowerShiftLeftParts / LowerShiftRightParts) can emit
// them.  Both use the SDTIntShiftDOp profile and carry no node properties.
def FUN_SHFL_CLAMP :
  SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP :
  SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1766
// shf.l.clamp.b32: selected for FUN_SHFL_CLAMP on (lo, hi, amt), all i32.
def FUNSHFLCLAMP :
  NVPTXInst<
    (outs Int32Regs:$dst),
    (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
    "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
    [(set Int32Regs:$dst,
          (FUN_SHFL_CLAMP (i32 Int32Regs:$lo),
                          (i32 Int32Regs:$hi),
                          (i32 Int32Regs:$amt)))]>;

// shf.r.clamp.b32: selected for FUN_SHFR_CLAMP on (lo, hi, amt), all i32.
def FUNSHFRCLAMP :
  NVPTXInst<
    (outs Int32Regs:$dst),
    (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
    "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
    [(set Int32Regs:$dst,
          (FUN_SHFR_CLAMP (i32 Int32Regs:$lo),
                          (i32 Int32Regs:$hi),
                          (i32 Int32Regs:$amt)))]>;
1780
1781//
1782// BFE - bit-field extract
1783//
1784
// Templates for the BFE/BFI instructions.
// BFE args: [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
// Start may be an imm only if length is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and length are always int32.
// bfe: one integer result of the same type as operand 1; operands 2 and 3
// are i32.
def SDTBFE : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisInt<0>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>
]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

// bfi: one integer result of the same type as operands 1 and 2; operands 3
// and 4 are i32.
def SDTBFI : SDTypeProfile<1, 4, [
  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
  SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

// prmt: an i32 result and four i32 operands.
def SDTPRMT : SDTypeProfile<1, 4, [
  SDTCisVT<0, i32>, SDTCisVT<1, i32>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
1805
// Bit-field extract.  $a is the source value (type T in RC); $b and $c are
// i32 and may be reg/reg (rrr), reg/imm (rri) or imm/imm (rii).
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
  def rri :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, i32imm:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
  def rii :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, i32imm:$b, i32imm:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}
1823
// Bit-field insert.  $a and $b have type T ($a may be a register or an
// ImmCls immediate, $b is always a register); $c and $d are i32 and may be
// reg/reg, reg/imm or imm/imm.  The def suffix spells the operand kinds in
// order (r = register, i = immediate).
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  // $a in a register.
  def rrrr :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T RC:$a), (T RC:$b),
                         (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def rrri :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T RC:$a), (T RC:$b),
                         (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def rrii :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T RC:$a), (T RC:$b),
                         (i32 imm:$c), (i32 imm:$d)))]>;

  // $a as an immediate.
  def irrr :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T imm:$a), (T RC:$b),
                         (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def irri :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T imm:$a), (T RC:$b),
                         (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def irii :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                    (bfi (T imm:$a), (T RC:$b),
                         (i32 imm:$c), (i32 imm:$d)))]>;
}
1856
// prmt.b32 with a printed ${mode} suffix.  $a is always a register; $b and
// $c may be reg/reg (rrr), reg/imm (rri) or imm/imm (rii).
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c),
                          imm:$mode))]>;
  def rri :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (prmt (T RC:$a), (T RC:$b), (i32 imm:$c),
                          imm:$mode))]>;
  def rii :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                    (prmt (T RC:$a), (T imm:$b), (i32 imm:$c),
                          imm:$mode))]>;
}
1874
// Instantiate the bit-field and permute templates; all are marked as having
// no side effects.
let hasSideEffects = false in {
  // Signed / unsigned bit-field extract, 32- and 64-bit.
  defm BFE_S32  : BFE<"bfe.s32", i32, Int32Regs>;
  defm BFE_U32  : BFE<"bfe.u32", i32, Int32Regs>;
  defm BFE_S64  : BFE<"bfe.s64", i64, Int64Regs>;
  defm BFE_U64  : BFE<"bfe.u64", i64, Int64Regs>;

  // Bit-field insert, 32- and 64-bit.
  defm BFI_B32  : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
  defm BFI_B64  : BFI<"bfi.b64", i64, Int64Regs, i64imm>;

  // Byte permute.
  defm PRMT_B32 : PRMT<i32, Int32Regs>;
}
1886
1887
// Byte extraction (bfe with constant 8 as its last operand) followed by a
// sign or zero extension to i32 folds into a single signed/unsigned BFE.
def : Pat<(i32 (sext_inreg
                 (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (sext_inreg
                 (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i32 (and
                 (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (and
                 (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;

// Byte extraction + sign extension to i16: signed BFE, then cvt.s8.s32.
def : Pat<(i16 (sext_inreg
                 (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1901
1902
// Byte extraction expressed with shift / trunc / sext_inreg.

// From a 32-bit source.
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg
                 (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;

// From a 64-bit source.
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg
                 (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
1918
1919//-----------------------------------
1920// Comparison instructions (setp, set)
1921//-----------------------------------
1922
1923// FIXME: This doesn't cover versions of set and setp that combine with a
1924// boolean predicate, e.g. setp.eq.and.b16.
1925
// setp: compare $a with $b under the comparison mode $cmp and write a
// predicate register.  Defines reg/reg (rr), reg/imm (ri) and imm/reg (ir)
// forms; none carries a selection pattern.
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins RC:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
  }
}
1942
// setp instantiations: 16/32/64-bit untyped, signed and unsigned integers,
// followed by f32 and f64.
defm SETP_b16 : SETP<"b16", Int16Regs,   i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs,   i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs,   i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs,   i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs,   i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs,   i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs,   i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs,   i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs,   i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// setp for f16 / bf16, register/register only.  The packed (x2) forms
// produce two predicates, printed as "$p|$q".

// f16: requires useFP16Math.
def SETP_f16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

def SETP_f16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

// bf16: requires hasBF16Math, PTX 7.8 and SM 9.0.
def SETP_bf16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

def SETP_bf16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

1978
1979
1980// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1981// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1982// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1983
// set: compare $a with $b under $cmp and write an Int32Regs result.  Defines
// reg/reg (rr), reg/imm (ri) and imm/reg (ir) forms; none carries a
// selection pattern.
let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ir :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
  }
}
1997
// set instantiations: integer types first, then floating-point types.
// SET_bf16 additionally requires PTX 7.8 and SM 9.0.
defm SET_b16  : SET<"b16",  Int16Regs,   i16imm>;
defm SET_s16  : SET<"s16",  Int16Regs,   i16imm>;
defm SET_u16  : SET<"u16",  Int16Regs,   i16imm>;
defm SET_b32  : SET<"b32",  Int32Regs,   i32imm>;
defm SET_s32  : SET<"s32",  Int32Regs,   i32imm>;
defm SET_u32  : SET<"u32",  Int32Regs,   i32imm>;
defm SET_b64  : SET<"b64",  Int64Regs,   i64imm>;
defm SET_s64  : SET<"s64",  Int64Regs,   i64imm>;
defm SET_u64  : SET<"u64",  Int64Regs,   i64imm>;
defm SET_f16  : SET<"f16",  Int16Regs,   f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs,   bf16imm>,
                Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32  : SET<"f32",  Float32Regs, f32imm>;
defm SET_f64  : SET<"f64",  Float64Regs, f64imm>;
2011
2012//-----------------------------------
2013// Data Movement (Load / Store, Move)
2014//-----------------------------------
2015
// Address-matching complex patterns.  ADDRri / ADDRri64 yield two operands
// and are rooted at frameindex nodes (SDNPWantRoot); ADDRvar yields a single
// operand via SelectDirectAddr.
def ADDRri :
  ComplexPattern<i32, 2, "SelectADDRri", [frameindex], [SDNPWantRoot]>;
def ADDRri64 :
  ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], [SDNPWantRoot]>;
def ADDRvar :
  ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
2021
// Memory operands made of a register plus an immediate, printed by
// printMemOperand.
def MEMri : Operand<i32> {
  let MIOperandInfo = (ops Int32Regs, i32imm);
  let PrintMethod = "printMemOperand";
}
def MEMri64 : Operand<i64> {
  let MIOperandInfo = (ops Int64Regs, i64imm);
  let PrintMethod = "printMemOperand";
}

// Pointer-typed operands printed with the generic printOperand.
def imem    : Operand<iPTR>    { let PrintMethod = "printOperand"; }
def imemAny : Operand<iPTRAny> { let PrintMethod = "printOperand"; }

// i32 code operands with dedicated print methods.
def LdStCode : Operand<i32> { let PrintMethod = "printLdStCode"; }
def MmaCode  : Operand<i32> { let PrintMethod = "printMmaCode"; }
2046
// NVPTXISD::Wrapper: one pointer-typed result with the same type as its
// single operand.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Materialize a wrapped global address in a 32- or 64-bit register.
def MOV_ADDR :
  NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
            "mov.u32 \t$dst, $a;",
            [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
            "mov.u64 \t$dst, $a;",
            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2057
// Load the address of the symbol __local_depot<num> (the local stack) into a
// 32- or 64-bit register.  No selection pattern; no side effects.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :
    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
              "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 :
    NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
              "mov.u64 \t$d, __local_depot$num;", []>;
}
2065
2066
// Register-to-register moves, flagged IsSimpleMove and side-effect free.
// None has a selection pattern; copyPhysReg in NVPTXInstrInfo.cpp is
// hard-coded to pick among them.
let IsSimpleMove = 1, hasSideEffects = 0 in {
  // Typed integer / predicate moves.
  def IMOV1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
              "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
              "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
              "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
              "mov.u64 \t$dst, $sss;", []>;

  // Untyped (.bNN) moves.
  def IMOVB16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
              "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
              "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
              "mov.b64 \t$dst, $sss;", []>;

  // Floating-point moves.  The f16 move is printed as .b16 because there is
  // no mov.f16.
  def FMOV16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
              "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
              "mov.f64 \t$dst, $src;", []>;
}
2093
// Immediate-to-register moves. The typed integer forms select directly on an
// `imm` node of the destination's width.
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                        "mov.pred \t$dst, $src;",
                        [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set (i32 Int32Regs:$dst), imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                        "mov.u64 \t$dst, $src;",
                        [(set Int64Regs:$dst, imm:$src)]>;

// Untyped (.bNN) immediate moves. These have no selection pattern.
def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                        "mov.b64 \t$dst, $src;", []>;

// Floating-point immediate moves, selected on `fpimm`.
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;
2120
// A Wrapper around a target external symbol is selected to an immediate move
// of the symbol, using the 32- or 64-bit form according to the result type.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
2123
//---- Copy Frame Index ----
// Selects an ADDRri / ADDRri64 address into a register. The address operand
// is printed with the "add" modifier (${addr:add}) as the source operands of
// an add.u32 / add.u64 instruction.
def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                            "add.u32 \t$dst, ${addr:add};",
                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
2131
2132//-----------------------------------
2133// Comparison and Selection
2134//-----------------------------------
2135
// Selection patterns for one integer comparison.
//
//   OpNode       - the comparison fragment to match (e.g. setgt, setult).
//   Mode         - comparison-mode leaf passed as the last operand of every
//                  selected instruction.
//   setp_NNxx    - instructions used when the comparison result is i1.
//   set_NNxx     - instructions used when the comparison result is i32.
//
// NN is the operand width (16/32/64). The suffix names the operand forms:
// rr = register/register, ri = register/immediate, ir = immediate/register.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}
2199
// ISET_FORMAT bound to the signed (SETP_sNN* / SET_sNN*) instruction family.
// Callers supply only the comparison fragment and the comparison mode.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  // The anonymous PatLeaf below exists only to give the body a definition.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (SETP_uNN* / SET_uNN*) instruction family.
// Callers supply only the comparison fragment and the comparison mode.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  // The anonymous PatLeaf below exists only to give the body a definition.
  def : PatLeaf<(i32 0)>;
}
2223
// Integer comparison patterns. The set{gt,lt,ge,le,eq,ne} fragments use the
// signed instruction family; the setu{gt,lt,ge,le,eq,ne} fragments use the
// unsigned family. Both families share the same comparison-mode leaves.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
2236
// i1 compares
// Inequality of two predicates is their XOR; equality is the negated XOR.
// The setu* fragments are given the same expansion as their set* counterparts.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2247
// comparisons of i8 extracted with BFE as i32
// Signed forms match a truncated i32 that is sign-extended in-register from
// i8 and select a 32-bit signed setp directly on the original i32 registers.
// NOTE(review): the selected instruction compares the full i32 values, so
// this presumably relies on $a/$b already holding correctly extended i8
// values (e.g. produced by BFE) -- confirm against the lowering code.
def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;

// Unsigned and equality forms match a truncated i32 masked with 255 and
// select a 32-bit unsigned setp on the original i32 registers. The ordering
// comparisons use the CmpHI / CmpHS / CmpLO / CmpLS modes.
def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
2270
// i1 compare -> i32
// The i32 result is produced by a select between the immediates -1 and 0,
// keyed on the XOR of the two predicates. For setne the XOR result selects
// -1; for seteq the two immediates are swapped so the XOR result selects 0.
// (The second pattern previously matched setne as well, which made it a dead
// duplicate of the first and left seteq without an i32 pattern.)
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2276
2277
2278
// Selection patterns for one floating-point comparison.
//
//   OpNode  - the comparison fragment to match (e.g. setogt, setune).
//   Mode    - comparison-mode leaf used by default.
//   ModeFTZ - comparison-mode leaf used by the patterns guarded by doF32FTZ.
//
// For each of f16, bf16, f32 and f64 the multiclass emits patterns producing
// either an i1 result (SETP_*) or an i32 result (SET_*). Each FTZ-guarded
// pattern is listed before its unguarded twin. f64 has no FTZ variants.
//
// f16 / bf16 immediates are first materialized into a register with
// LOAD_CONST_F16 / LOAD_CONST_BF16, so every f16 / bf16 pattern uses the
// register-register (rr) instruction form, including the "imm, reg" cases.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant is materialized into a register, so the rr form is used
  // here, matching the equivalent SETP_f16rr patterns above.
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // The constant is materialized into a register, so the rr form is used
  // here, matching the equivalent SETP_bf16rr patterns above.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
2410
// Ordered comparisons (seto*).
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons (setu*), mapped to the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons without an ordered/unordered prefix; these use the same modes
// as the ordered group above.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
2434
2435// FIXME: What is this doing here?  Can it be deleted?
2436// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
2437//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
2438
// Type profiles for the NVPTX call-lowering DAG nodes defined below.
// SDTypeProfile<R, N, C> declares R results, N operands and the list of type
// constraints C; constraint indices count results first, then operands.

// Parameter declarations: three integer operands, no result.
def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
// Parameter loads: 1, 2 or 4 results followed by two integer operands. The
// V2 form additionally requires both results to have the same type.
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
// Call printing: a single integer operand.
def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
// Parameter stores: the first two operands are integers; the remaining
// operands are unconstrained.
def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
// Call-argument list and call nodes.
def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
def SDTCallValProfile : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
// Return-value stores: the first operand is an integer; the remaining
// operands are unconstrained.
def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
2462
// DAG nodes for NVPTX call lowering. Each def binds a TableGen name to an
// NVPTXISD opcode, a type profile and a list of node properties. Most nodes
// are chained, glued on both sides and marked as having side effects; the
// exceptions are called out below.

// Parameter / return declarations.
def DeclareParam :
  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam :
  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam :
  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet :
  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter loads: marked SDNPMayLoad instead of SDNPSideEffect.
def LoadParam :
  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 :
  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 :
  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
// Call printing, in regular / convergent and non-uni / uni flavours.
def PrintCall :
  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall :
  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni :
  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni :
  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter stores.
def StoreParam :
  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 :
  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 :
  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 :
  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 :
  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Call-argument list construction and the call itself.
def CallArgBegin :
  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg :
  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg :
  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd :
  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid :
  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype :
  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal :
  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam has no node properties (no chain, no glue).
def MoveParam :
  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
// Return-value stores and RETURN are chained with side effects but not glued.
def StoreRetval :
  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 :
  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 :
  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def PseudoUseParam :
  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode :
  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg :
  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2552
// ld.param instruction templates reading from [retval0 + $b].
//   regclass - register class of every destination register.
//   opstr    - type suffix appended to the mnemonic (e.g. ".b32").
// None of these carries a selection pattern.
let mayLoad = true in {
  // Scalar load: one destination register.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                "ld.param" # opstr # " \t$dst, [retval0+$b];",
                []>;

  // Two-element vector load.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                (ins i32imm:$b),
                "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                []>;

  // Four-element vector load.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                (ins i32imm:$b),
                "ld.param.v4" # opstr
                  # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                []>;
}
2572
// Register-form parameter load: emits a mov from the symbol "retval$b" and
// selects on a LoadParam node whose first operand is the constant 0.
//   regclass - destination register class.
//   opstr    - type suffix appended to "mov".
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins i32imm:$b),
              "mov" # opstr # " \t$dst, retval$b;",
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2577
// st.param instruction templates. None of these carries a selection pattern.
//   regclass - register class of every stored value.
//   opstr    - type suffix appended to the mnemonic (e.g. ".b32").
let mayStore = true in {
  // --- Stores to an outgoing call parameter: [param$a + $b] ---

  // Scalar store.
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a, i32imm:$b),
                "st.param" # opstr # " \t[param$a+$b], $val;",
                []>;

  // Two-element vector store.
  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                "st.param.v2" # opstr # " \t[param$a+$b], {{$val, $val2}};",
                []>;

  // Four-element vector store.
  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a, i32imm:$b),
                "st.param.v4" # opstr
                  # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                []>;

  // --- Stores to the function's return slot: [func_retval0 + $a] ---

  // Scalar store.
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a),
                "st.param" # opstr # " \t[func_retval0+$a], $val;",
                []>;

  // Two-element vector store.
  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a),
                "st.param.v2" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2}};",
                []>;

  // Four-element vector store.
  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a),
                "st.param.v4" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                []>;
}
2618
// Call-header instructions. For a call mnemonic OpcStr and a call-printing
// node OpNode, one instruction is defined per return-value count 0..8; each
// selects on OpNode applied to that count and prints the mnemonic followed by
// the parenthesized list "retval0 .. retval<N-1>" (nothing for N = 0).
let isCall=1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst
      : NVPTXInst<(outs), (ins),
                  OpcStr # " ",
                  [(OpNode (i32 0))]>;
    def PrintCallRetInst1
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0), ",
                  [(OpNode (i32 1))]>;
    def PrintCallRetInst2
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1), ",
                  [(OpNode (i32 2))]>;
    def PrintCallRetInst3
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2), ",
                  [(OpNode (i32 3))]>;
    def PrintCallRetInst4
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2, retval3), ",
                  [(OpNode (i32 4))]>;
    def PrintCallRetInst5
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
                  [(OpNode (i32 5))]>;
    def PrintCallRetInst6
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5), ",
                  [(OpNode (i32 6))]>;
    def PrintCallRetInst7
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
                  [(OpNode (i32 7))]>;
    def PrintCallRetInst8
      : NVPTXInst<(outs), (ins),
                  OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5, retval6, retval7), ",
                  [(OpNode (i32 8))]>;
  }
}
2649
// Regular call headers: "call" and "call.uni".
defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
// They print the same mnemonics but select on the PrintConvergentCall* nodes.
let isConvergent=1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2659
// Concrete ld.param instructions. Integer forms use the untyped .bNN suffix;
// the I8 variants load .b8 into Int16Regs. No 64-bit V4 variant is defined.
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
// Floating-point forms use the typed .f32 / .f64 suffix.
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
2676
// Concrete st.param instructions for outgoing call parameters. Integer forms
// use the untyped .bNN suffix; the I8 variants store .b8 from Int16Regs.
// No 64-bit V4 variant is defined.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

// Floating-point forms use the typed .f32 / .f64 suffix.
def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
2696
// Instantiations of the StoreRetval*Inst classes (declared outside this
// section), one per register class / PTX type-suffix pair.
// Integer forms; the I8 variants use Int16Regs with the ".b8" suffix.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

// Floating-point forms.
def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
2714
// Operand-less instructions that print the punctuation around a call's
// argument list, plus the return instruction.
//   CallArgBeginInst : prints "("  for a CallArgBegin node.
//   CallArgEndInst1  : prints ");" for CallArgEnd with flag 1.
//   CallArgEndInst0  : prints ")"  for CallArgEnd with flag 0 (no terminating
//                      ';' -- presumably something else finishes the
//                      statement; verify against the call lowering code).
//   RETURNInst       : prints "ret;" for RETURNNode.
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2719
// A non-final call argument held in a register: prints "$a, " (with the
// trailing separator) and matches CallArg whose first operand is (i32 0).
// The pattern operand is typed by the register class.
class CallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
            [(CallArg (i32 0), regclass:$a)]>;

// Same as CallArgInst, but the pattern operand is typed by an explicit
// ValueType instead of by the register class.
class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
            [(CallArg (i32 0), vt:$a)]>;
2727
// The final call argument held in a register: prints "$a" with no trailing
// separator and matches LastCallArg whose first operand is (i32 0).
class LastCallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a",
            [(LastCallArg (i32 0), regclass:$a)]>;
// Same as LastCallArgInst, but the pattern operand is typed by an explicit
// ValueType instead of by the register class.
class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs), (ins regclass:$a), "$a",
            [(LastCallArg (i32 0), vt:$a)]>;
2734
// Register call-argument instructions.  The I32 and I16 variants use the
// *VT classes so that the pattern names an explicit value type; the others
// type the pattern by register class.
def CallArgI64     : CallArgInst<Int64Regs>;
def CallArgI32     : CallArgInstVT<Int32Regs, i32>;
def CallArgI16     : CallArgInstVT<Int16Regs, i16>;
def CallArgF64     : CallArgInst<Float64Regs>;
def CallArgF32     : CallArgInst<Float32Regs>;

// Final-argument counterparts of the definitions above.
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2746
// Call arguments given as an i32 immediate.  With first operand (i32 0) the
// immediate itself is printed ("$a, " / "$a").
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
                              [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;

// With first operand (i32 1) the immediate is printed as a parameter name:
// "param$a, " / "param$a".
def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
                             [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;
2756
// Callee operand of a call.  Each form prints "$addr, " and matches CallVoid:
//   CallVoidInst      : callee is a wrapped global address (imem operand).
//   CallVoidInstReg   : callee address held in a 32-bit register.
//   CallVoidInstReg64 : callee address held in a 64-bit register.
def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
                                  [(CallVoid i32:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                                  [(CallVoid Int64Regs:$addr)]>;
// Prints ", prototype_$val;" for a Prototype node; note that the text carries
// the statement-terminating ';'.
def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
                                  [(Prototype (i32 imm:$val))]>;
2765
// Declarations of the return-value slot "retval$num".
// DeclareRetMemInst prints an aligned byte array in .param space:
//   .param .align $align .b8 retval$num[$size];
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
// DeclareRet with leading operand 1 prints a scalar .param declaration whose
// width is taken from $size:  .param .b$size retval$num;
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
// DeclareRet with leading operand 2 prints a .reg declaration instead:
//   .reg .b$size retval$num;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2778
// Declarations of the outgoing call parameter "param$a".
// DeclareParamInst prints an aligned byte array in .param space:
//   .param .align $align .b8 param$a[$size];
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
// DeclareScalarParam with trailing operand 0 prints a scalar .param
// declaration:  .param .b$size param$a;
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
// DeclareScalarParam with trailing operand 1 prints a .reg declaration:
//   .reg .b$size param$a;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2791
// Register-to-register copy selected for a MoveParam node of value type T.
// Prints "mov" immediately followed by asmstr (e.g. ".b64"), then the
// destination and source operands, both of which live in regclass.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
2796
// MoveParam whose source is an external symbol rather than a register.
// The symbol is carried in an operand of kind srcty and the result, of value
// type vt, is written to a register in regclass.  Prints "mov" followed by
// asmstr and the two operands.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins srcty:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set vt:$dst, (MoveParam texternalsym:$src))]>;
2802
// Register-source MoveParam instructions for 64- and 32-bit integers.
def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

// External-symbol-source MoveParam instructions; the symbol is carried in an
// immediate operand of the matching width.
def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The i16 form does not use MoveParamInst: it prints a cvt.u16.u32 even
// though both operands are declared in Int16Regs (see the open question in
// the inline comment below).
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
            [(set i16:$dst, (MoveParam i16:$src))]>;
// Floating-point register-source MoveParam instructions.
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
2815
// Matches a PseudoUseParam node on a value of type vt held in regclass.
// It defines no outputs and its assembly text is only a "//" comment line
// naming the source operand.
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs), (ins regclass:$src),
            "// Pseudo use of $src",
            [(PseudoUseParam vt:$src)]>;

// One instantiation per scalar register class / value type.
def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
2826
// Same-class register copy selected for a ProxyReg node of value type T.
// Prints "mov." followed by SzStr (e.g. "b32", "pred") and the destination
// and source operands.
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov." # SzStr # " \t$dst, $src;",
                [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
2831
// ProxyReg instructions for each scalar register class.  The i1 form prints
// "mov.pred"; integer forms use ".bNN" and floating-point forms ".fNN".
def ProxyRegI1    : ProxyRegInst<"pred", i1, Int1Regs>;
def ProxyRegI16   : ProxyRegInst<"b16",  i16, Int16Regs>;
def ProxyRegI32   : ProxyRegInst<"b32",  i32, Int32Regs>;
def ProxyRegI64   : ProxyRegInst<"b64",  i64, Int64Regs>;
def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;

// f16 and bf16 values are selected to the 16-bit integer ProxyReg.
foreach vt = [f16, bf16] in {
  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
}

// The listed 32-bit vector types are selected to the 32-bit integer ProxyReg.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
}
2846
2847//
2848// Load / Store Handling
2849//
// Scalar load into a register of class regclass, one record per addressing
// form.  None of the records carries a selection pattern.
//   _avar    : [symbol]              _areg : [32-bit reg]
//   _areg_64 : [64-bit reg]          _ari  : [32-bit reg + imm]
//   _ari_64  : [64-bit reg + imm]    _asi  : [symbol + imm]
multiclass LD<NVPTXRegClass regclass> {
  // Modifier operands common to every form; the address operands are
  // appended per record with !con.
  defvar LdMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  // Mnemonic with its printed modifiers, up to and including the tab.
  defvar LdOpc =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";
  defvar LdPlain  = !strconcat(LdOpc, "$dst, [$addr];");
  defvar LdOffset = !strconcat(LdOpc, "$dst, [$addr+$offset];");

  def _avar    : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins imem:$addr)),
                           LdPlain, []>;
  def _areg    : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int32Regs:$addr)),
                           LdPlain, []>;
  def _areg_64 : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int64Regs:$addr)),
                           LdPlain, []>;
  def _ari     : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int32Regs:$addr, i32imm:$offset)),
                           LdOffset, []>;
  def _ari_64  : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int64Regs:$addr, i32imm:$offset)),
                           LdOffset, []>;
  def _asi     : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins imem:$addr, i32imm:$offset)),
                           LdOffset, []>;
}
2888
// Scalar load instructions, all marked mayLoad with no other side effects.
// LD_i8 and LD_i16 both use Int16Regs as the destination class.
let mayLoad=1, hasSideEffects=0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}
2897
// Scalar store of a register of class regclass, one record per addressing
// form.  None of the records carries a selection pattern.
//   _avar    : [symbol]              _areg : [32-bit reg]
//   _areg_64 : [64-bit reg]          _ari  : [32-bit reg + imm]
//   _ari_64  : [64-bit reg + imm]    _asi  : [symbol + imm]
multiclass ST<NVPTXRegClass regclass> {
  // Stored value followed by the modifier operands common to every form; the
  // address operands are appended per record with !con.
  defvar StHead = (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
                       LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth);
  // Mnemonic with its printed modifiers, up to and including the tab.
  defvar StOpc =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth \t";
  defvar StPlain  = !strconcat(StOpc, "[$addr], $src;");
  defvar StOffset = !strconcat(StOpc, "[$addr+$offset], $src;");

  def _avar    : NVPTXInst<(outs),
                           !con(StHead, (ins imem:$addr)),
                           StPlain, []>;
  def _areg    : NVPTXInst<(outs),
                           !con(StHead, (ins Int32Regs:$addr)),
                           StPlain, []>;
  def _areg_64 : NVPTXInst<(outs),
                           !con(StHead, (ins Int64Regs:$addr)),
                           StPlain, []>;
  def _ari     : NVPTXInst<(outs),
                           !con(StHead, (ins Int32Regs:$addr, i32imm:$offset)),
                           StOffset, []>;
  def _ari_64  : NVPTXInst<(outs),
                           !con(StHead, (ins Int64Regs:$addr, i32imm:$offset)),
                           StOffset, []>;
  def _asi     : NVPTXInst<(outs),
                           !con(StHead, (ins imem:$addr, i32imm:$offset)),
                           StOffset, []>;
}
2936
// Scalar store instructions, all marked mayStore with no other side effects.
// ST_i8 and ST_i16 both use Int16Regs as the source class.
let mayStore=1, hasSideEffects=0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}
2945
2946// The following is used only in and after vector elementizations.  Vector
2947// elementization happens at the machine instruction level, so the following
2948// instructions never appear in the DAG.
// Vector loads producing two (_v2_*) or four (_v4_*) registers of class
// regclass, one record per addressing form (avar / areg / areg_64 / ari /
// ari_64 / asi, as for the scalar LD multiclass).  None of the records
// carries a selection pattern.
multiclass LD_VEC<NVPTXRegClass regclass> {
  // Modifier operands common to every form; the address operands are
  // appended per record with !con.
  defvar LdvMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                        LdStCode:$Sign, i32imm:$fromWidth);
  // Mnemonic with its printed modifiers, up to and including the tab.
  defvar LdvOpc =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";
  // "{{" / "}}" print as literal braces around the register list.
  defvar Ldv2Plain  = !strconcat(LdvOpc, "{{$dst1, $dst2}}, [$addr];");
  defvar Ldv2Offset = !strconcat(LdvOpc, "{{$dst1, $dst2}}, [$addr+$offset];");
  defvar Ldv4Plain  =
    !strconcat(LdvOpc, "{{$dst1, $dst2, $dst3, $dst4}}, [$addr];");
  defvar Ldv4Offset =
    !strconcat(LdvOpc, "{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];");

  // Two-element forms.
  def _v2_avar    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods, (ins imem:$addr)),
                              Ldv2Plain, []>;
  def _v2_areg    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods, (ins Int32Regs:$addr)),
                              Ldv2Plain, []>;
  def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods, (ins Int64Regs:$addr)),
                              Ldv2Plain, []>;
  def _v2_ari     : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Ldv2Offset, []>;
  def _v2_ari_64  : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Ldv2Offset, []>;
  def _v2_asi     : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                              !con(LdvMods, (ins imem:$addr, i32imm:$offset)),
                              Ldv2Offset, []>;

  // Four-element forms.
  def _v4_avar    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods, (ins imem:$addr)),
                              Ldv4Plain, []>;
  def _v4_areg    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods, (ins Int32Regs:$addr)),
                              Ldv4Plain, []>;
  def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods, (ins Int64Regs:$addr)),
                              Ldv4Plain, []>;
  def _v4_ari     : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Ldv4Offset, []>;
  def _v4_ari_64  : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Ldv4Offset, []>;
  def _v4_asi     : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                                    regclass:$dst3, regclass:$dst4),
                              !con(LdvMods, (ins imem:$addr, i32imm:$offset)),
                              Ldv4Offset, []>;
}
// Vector load instructions, all marked mayLoad with no other side effects.
// LDV_i8 and LDV_i16 both use Int16Regs as the destination class.
let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}
3031
// Vector stores of two (_v2_*) or four (_v4_*) registers of class regclass,
// one record per addressing form (avar / areg / areg_64 / ari / ari_64 /
// asi, as for the scalar ST multiclass).  None of the records carries a
// selection pattern.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // Modifier operands common to every form.
  defvar StvMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                        LdStCode:$Sign, i32imm:$fromWidth);
  // Stored registers followed by the modifiers; the address operands are
  // appended per record with !con.
  defvar Stv2Head = !con((ins regclass:$src1, regclass:$src2), StvMods);
  defvar Stv4Head = !con((ins regclass:$src1, regclass:$src2,
                              regclass:$src3, regclass:$src4), StvMods);
  // Mnemonic with its printed modifiers, up to and including the tab.
  defvar StvOpc =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";
  // "{{" / "}}" print as literal braces around the register list.
  defvar Stv2Plain  = !strconcat(StvOpc, "[$addr], {{$src1, $src2}};");
  defvar Stv2Offset = !strconcat(StvOpc, "[$addr+$offset], {{$src1, $src2}};");
  defvar Stv4Plain  =
    !strconcat(StvOpc, "[$addr], {{$src1, $src2, $src3, $src4}};");
  defvar Stv4Offset =
    !strconcat(StvOpc, "[$addr+$offset], {{$src1, $src2, $src3, $src4}};");

  // Two-element forms.
  def _v2_avar    : NVPTXInst<(outs),
                              !con(Stv2Head, (ins imem:$addr)),
                              Stv2Plain, []>;
  def _v2_areg    : NVPTXInst<(outs),
                              !con(Stv2Head, (ins Int32Regs:$addr)),
                              Stv2Plain, []>;
  def _v2_areg_64 : NVPTXInst<(outs),
                              !con(Stv2Head, (ins Int64Regs:$addr)),
                              Stv2Plain, []>;
  def _v2_ari     : NVPTXInst<(outs),
                              !con(Stv2Head,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Stv2Offset, []>;
  def _v2_ari_64  : NVPTXInst<(outs),
                              !con(Stv2Head,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Stv2Offset, []>;
  def _v2_asi     : NVPTXInst<(outs),
                              !con(Stv2Head, (ins imem:$addr, i32imm:$offset)),
                              Stv2Offset, []>;

  // Four-element forms.
  def _v4_avar    : NVPTXInst<(outs),
                              !con(Stv4Head, (ins imem:$addr)),
                              Stv4Plain, []>;
  def _v4_areg    : NVPTXInst<(outs),
                              !con(Stv4Head, (ins Int32Regs:$addr)),
                              Stv4Plain, []>;
  def _v4_areg_64 : NVPTXInst<(outs),
                              !con(Stv4Head, (ins Int64Regs:$addr)),
                              Stv4Plain, []>;
  def _v4_ari     : NVPTXInst<(outs),
                              !con(Stv4Head,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Stv4Offset, []>;
  def _v4_ari_64  : NVPTXInst<(outs),
                              !con(Stv4Head,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Stv4Offset, []>;
  def _v4_asi     : NVPTXInst<(outs),
                              !con(Stv4Head, (ins imem:$addr, i32imm:$offset)),
                              Stv4Offset, []>;
}
3115
// Vector store instructions, all marked mayStore with no other side effects.
// STV_i8 and STV_i16 both use Int16Regs as the source class.
let mayStore=1, hasSideEffects=0 in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}
3124
3125//---- Conversion ----
3126
// Bitcast between value types TIn and TOut, printed as "mov.b" followed by
// SzStr (the width in bits, e.g. "32").  The operand register classes
// default to whatever ValueToRegClass reports for each value type.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret>
    : NVPTXInst<(outs regclassOut:$d),
                (ins regclassIn:$a),
                "mov.b" # SzStr # " \t$d, $a;",
                [(set (TOut regclassOut:$d),
                      (bitconvert (TIn regclassIn:$a)))]>;
3133
// Integer <-> floating-point bitcasts of equal width (32 and 64 bits).
def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
3138
// Bitcasts between f32 and the listed 32-bit vector types reuse the
// 32-bit F2I / I2F move instructions; the vector operand is taken from
// Int32Regs.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
         (BITCONVERT_32_F2I Float32Regs:$a)>;
def: Pat<(f32 (bitconvert (vt Int32Regs:$a))),
         (BITCONVERT_32_I2F Int32Regs:$a)>;
}
// Bitcasts between i16 and f16/bf16: a constant source becomes an immediate
// move (IMOVB16ri); a register source becomes a 16-bit ProxyReg move in
// either direction.
foreach vt = [f16, bf16] in {
def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
         (IMOVB16ri UInt16Const:$a)>;
def: Pat<(vt (bitconvert (i16 Int16Regs:$a))),
         (ProxyRegI16 Int16Regs:$a)>;
def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
         (ProxyRegI16 Int16Regs:$a)>;
}
3153
// Bitcasts among i32 and the 32-bit vector types.
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  // A 32-bit constant source becomes an immediate move.
  def: Pat<(ta (bitconvert (i32 UInt32Const:$a))),
           (IMOVB32ri UInt32Const:$a)>;
  // Every ordered pair of distinct types (ta != tb) becomes a 32-bit
  // ProxyReg move.
  foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(ta, tb) then {
      def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
             (ProxyRegI32 Int32Regs:$a)>;
    }
  }
}
3164
// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore, we use an integer selp to select the integer value for each
// predicate state and then cvt the result to floating-point.
3168
// sint -> f16
// Integer sources convert with CVT in round-to-nearest mode (CvtRN).  An i1
// source is first widened to i32 with SELP.  A signed conversion reads a true
// i1 as -1, so the predicate is materialized as -1/0 (the same widening the
// "sext i1" patterns use) and converts to -1.0 rather than +1.0.
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
// An unsigned conversion reads a true i1 as 1, so the predicate is
// materialized as 1/0 before the CVT.
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
3188
// sint -> bf16
// Integer sources convert with CVT in round-to-nearest mode (CvtRN).  An i1
// source is first widened to i32 with SELP.  A signed conversion reads a true
// i1 as -1, so the predicate is materialized as -1/0 (the same widening the
// "sext i1" patterns use) and converts to -1.0 rather than +1.0.
def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
          (CVT_bf16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
          (CVT_bf16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
          (CVT_bf16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
          (CVT_bf16_s64 Int64Regs:$a, CvtRN)>;

// uint -> bf16
// An unsigned conversion reads a true i1 as 1, so the predicate is
// materialized as 1/0 before the CVT.
def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>;
3208
// sint -> f32
// Integer sources convert with CVT in round-to-nearest mode (CvtRN).  An i1
// source is first widened to i32 with SELP.  A signed conversion reads a true
// i1 as -1, so the predicate is materialized as -1/0 (the same widening the
// "sext i1" patterns use) and converts to -1.0 rather than +1.0.
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
// An unsigned conversion reads a true i1 as 1, so the predicate is
// materialized as 1/0 before the CVT.
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
3228
// sint -> f64
// Integer sources convert with CVT in round-to-nearest mode (CvtRN).  An i1
// source is first widened to i32 with SELP.  A signed conversion reads a true
// i1 as -1, so the predicate is materialized as -1/0 (the same widening the
// "sext i1" patterns use) and converts to -1.0 rather than +1.0.
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
// An unsigned conversion reads a true i1 as 1, so the predicate is
// materialized as 1/0 before the CVT.
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
3248
3249
// f16 -> sint
// i16/i32/i64 results use CVT with the CvtRZI mode.  The i1 result is not a
// CVT: it is a 16-bit SETP comparing the source register with 0 using CmpEQ.
// NOTE(review): as written the i1 predicate is true when the register
// compares equal to 0 -- confirm this is the intended fp -> i1 result.
def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;

// f16 -> uint
// Same structure as the signed patterns above, using the unsigned CVT forms.
def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
3269
// bf16 -> sint
// i16/i32/i64 results use CVT with the CvtRZI mode.  The i1 result is not a
// CVT: it is a 16-bit SETP comparing the source register with 0 using CmpEQ.
// NOTE(review): as written the i1 predicate is true when the register
// compares equal to 0 -- confirm this is the intended fp -> i1 result.
def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;

// bf16 -> uint
// Same structure as the signed patterns above, using the unsigned CVT forms.
def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
// f32 -> sint
// Each i16/i32/i64 result has two patterns: one guarded by doF32FTZ that
// passes the CvtRZI_FTZ mode, followed by an unguarded one that passes
// CvtRZI.  The i1 result bitcasts the source to i32 and compares it with 0
// using CmpEQ.
// NOTE(review): as written the i1 predicate is true when the bits compare
// equal to 0 -- confirm this is the intended fp -> i1 result.
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
// Same structure as the signed patterns above, using the unsigned CVT forms.
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
3320
// f64 -> sint
// i16/i32/i64 results use CVT with the CvtRZI mode (no FTZ variants here).
// The i1 result bitcasts the source to i64 and compares it with 0 using
// CmpEQ.
// NOTE(review): as written the i1 predicate is true when the bits compare
// equal to 0 -- confirm this is the intended fp -> i1 result.
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
// Same structure as the signed patterns above, using the unsigned CVT forms.
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
3340
// sext i1
// A predicate is widened with SELP between two immediates: -1 when the
// predicate is set, 0 otherwise.
def : Pat<(i16 (sext Int1Regs:$a)),
          (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)),
          (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)),
          (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1
// Zero extension selects 1 when the predicate is set, 0 otherwise.
def : Pat<(i16 (zext Int1Regs:$a)),
          (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)),
          (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)),
          (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1
// The any-extend patterns pick -1 / 0 (the same immediates as sext) while
// using the SELP_u* instructions.
def : Pat<(i16 (anyext Int1Regs:$a)),
          (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)),
          (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)),
          (SELP_u64ii -1, 0, Int1Regs:$a)>;
3364
// sext i16 / sext i32: signed cvt with no conversion modifier.
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext and anyext of i16 / i32 share the same lowering: unsigned cvt with no
// conversion modifier.
foreach ExtOp = [zext, anyext] in {
  def : Pat<(i32 (ExtOp Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
  def : Pat<(i64 (ExtOp Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
  def : Pat<(i64 (ExtOp Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
}
3394
3395
// Integer truncation.
//
// Truncating to i32 / i16 is an unsigned cvt with no conversion modifier.
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;

// Truncating to i1 keeps only bit 0: mask the source with 1 and set the
// predicate when the masked value equals 1.
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
3413
// sext_inreg: each (register width, source width) pair maps onto the
// CVT_INREG_s<reg>_s<src> instruction of the same name.

// 16-bit register
def : Pat<(sext_inreg Int16Regs:$a, i8),
          (CVT_INREG_s16_s8 Int16Regs:$a)>;

// 32-bit register
def : Pat<(sext_inreg Int32Regs:$a, i8),
          (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16),
          (CVT_INREG_s32_s16 Int32Regs:$a)>;

// 64-bit register
def : Pat<(sext_inreg Int64Regs:$a, i8),
          (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16),
          (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32),
          (CVT_INREG_s64_s32 Int64Regs:$a)>;
3421
3422
// Select instructions with 32-bit predicates.
//
// The i32 condition is converted to a predicate by masking bit 0
// (ANDb32ri ..., 1) and comparing the result with 1 (SETP_b32ri ..., CmpEQ).
// That predicate then drives the SELP of the matching value type.

// Integer values.
def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
          (SELP_b16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
          (SELP_b32rr
             Int32Regs:$a, Int32Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr
             Int64Regs:$a, Int64Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;

// 16-bit floating-point values (held in Int16Regs).
def : Pat<(select (i32 Int32Regs:$pred),
                  (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
          (SELP_f16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
          (SELP_bf16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;

// f32 / f64 values.
def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr
             Float32Regs:$a, Float32Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr
             Float64Regs:$a, Float64Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3445
3446
// Register pack / unpack helpers built on the vector-operand form of mov.
// None of them carries a selection pattern; they are only reachable through
// explicit Pat<> definitions or custom selection code.
let hasSideEffects = false in {
  // Pack a set of smaller registers into one larger register.
  def V4I16toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};",
              []>;
  def V2I16toI32 :
    NVPTXInst<(outs Int32Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2),
              "mov.b32 \t$d, {{$s1, $s2}};",
              []>;
  def V2I32toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int32Regs:$s1, Int32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};",
              []>;
  def V2F32toF64 :
    NVPTXInst<(outs Float64Regs:$d),
              (ins Float32Regs:$s1, Float32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};",
              []>;

  // Unpack one larger register into a set of smaller registers.
  def I64toV4I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;",
              []>;
  def I32toV2I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
              (ins Int32Regs:$s),
              "mov.b32 \t{{$d1, $d2}}, $s;",
              []>;
  def I64toV2I32 :
    NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;",
              []>;
  def F64toV2F32 :
    NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
              (ins Float64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;",
              []>;

  // Extract a single half of a register.  The half that is not wanted is
  // moved into a scratch register `tmp` declared inside the emitted braces.
  def I32toI16H :
    NVPTXInst<(outs Int16Regs:$high),
              (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
              []>;
  def I32toI16L :
    NVPTXInst<(outs Int16Regs:$low),
              (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
              []>;
  def I64toI32H :
    NVPTXInst<(outs Int32Regs:$high),
              (ins Int64Regs:$s),
              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
              []>;
}
3491
// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
//
// Truncating a right shift by half the register width yields the upper half
// of the source regardless of whether the shift is logical (srl) or
// arithmetic (sra), so both forms select the same instruction.
foreach ShrOp = [srl, sra] in
  def : Pat<(i16 (trunc (ShrOp Int32Regs:$s, (i32 16)))),
            (I32toI16H Int32Regs:$s)>;
foreach ShrOp = [srl, sra] in
  def : Pat<(i32 (trunc (ShrOp Int64Regs:$s, (i32 32)))),
            (I64toI32H Int64Regs:$s)>;
3502
// Two-element 16-bit vectors (v2f16, v2bf16, v2i16) are held in a 32-bit
// integer register.

// Sign-extending element 0 of a v2i16 to i32 is an in-register sign extension
// of the low 16 bits of the containing register.
def : Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
          (CVT_INREG_s32_s16 Int32Regs:$src)>;

// Element extraction: element 0 is the low half, element 1 the high half.
foreach vt = [v2f16, v2bf16, v2i16] in {
  def : Pat<(extractelt (vt Int32Regs:$src), 0), (I32toI16L Int32Regs:$src)>;
  def : Pat<(extractelt (vt Int32Regs:$src), 1), (I32toI16H Int32Regs:$src)>;
}

// Building a vector from two 16-bit scalars packs them with V2I16toI32,
// whatever the element type.
foreach EltTy = ["f16", "bf16", "i16"] in
  def : Pat<(!cast<ValueType>("v2" # EltTy)
               (build_vector (!cast<ValueType>(EltTy) Int16Regs:$a),
                             (!cast<ValueType>(EltTy) Int16Regs:$b))),
            (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;

// scalar_to_vector only defines element 0; widen the scalar with an unsigned
// cvt to 32 bits.
def : Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3521
// Count leading zeros
//
// Both variants write their result to a 32-bit register, including the one
// that takes a 64-bit source.
let hasSideEffects = false in {
  def CLZr32 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
              "clz.b32 \t$d, $a;",
              []>;
  def CLZr64 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
              "clz.b64 \t$d, $a;",
              []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctlz (i32 Int32Regs:$a))),
          (CLZr32 Int32Regs:$a)>;
3532
// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
// i64 result: widen the 32-bit CLZr64 result with an unsigned cvt.
def : Pat<(i64 (ctlz Int64Regs:$a)),
          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
// Result immediately truncated to i32: use the 32-bit CLZr64 result directly.
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))),
          (CLZr64 Int64Regs:$a)>;
3539
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary.  We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then ctlz that value.  This way we don't have to subtract 16 from the
// result.  Unfortunately today we don't have a way to generate
// "mov.b32 b32reg, {b16imm, b16reg}", so we don't do this optimization.
// i16 result: zero-extend to 32 bits, count, narrow the count back to 16
// bits, then subtract the 16 extra leading zeros introduced by the extension.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
             (CVT_u16_u32
                (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                CvtNONE),
             16)>;
// Result zero-extended to i32 anyway: skip the narrowing cvt and subtract in
// 32 bits.
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri
             (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
             16)>;
3559
// Population count
//
// Both variants write their result to a 32-bit register, including the one
// that takes a 64-bit source.
let hasSideEffects = false in {
  def POPCr32 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
              "popc.b32 \t$d, $a;",
              []>;
  def POPCr64 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
              "popc.b64 \t$d, $a;",
              []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctpop (i32 Int32Regs:$a))),
          (POPCr32 Int32Regs:$a)>;
3570
3571// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3572// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
3573// pattern that avoids the type conversion if we're truncating the result to
3574// i32 anyway.
// i64 result: widen the 32-bit POPCr64 result with an unsigned cvt.
def : Pat<(i64 (ctpop Int64Regs:$a)),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
// Result immediately truncated to i32: use the 32-bit POPCr64 result directly.
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;
3577
3578// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3579// If we know that we're storing into an i32, we can avoid the final trunc.
// i16 result: widen, count in 32 bits, narrow the count back to 16 bits.
def : Pat<(i16 (ctpop Int16Regs:$a)),
          (CVT_u16_u32
             (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
             CvtNONE)>;
// Result zero-extended to i32 anyway: the 32-bit count is used as is.
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
3584
// fpround
//
// Rounding to a 16-bit type (f16 or bf16), from either f32 or f64, is a cvt
// with the CvtRN mode; there is no FTZ variant for these.
foreach HalfTy = ["f16", "bf16"] in {
  def : Pat<(!cast<ValueType>(HalfTy) (fpround Float32Regs:$a)),
            (!cast<Instruction>("CVT_" # HalfTy # "_f32")
               Float32Regs:$a, CvtRN)>;
  def : Pat<(!cast<ValueType>(HalfTy) (fpround Float64Regs:$a)),
            (!cast<Instruction>("CVT_" # HalfTy # "_f64")
               Float64Regs:$a, CvtRN)>;
}

// fpround f64 -> f32: the doF32FTZ-guarded pattern comes first and uses the
// FTZ flavour of the mode; the unguarded pattern follows.
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
3605
// fpextend
//
// Extension from a 16-bit type (f16 or bf16, held in Int16Regs).  Extending
// to f32 has a doF32FTZ-guarded pattern, defined first, and an unguarded one;
// extending to f64 has a single pattern.
foreach HalfTy = ["f16", "bf16"] in {
  def : Pat<(f32 (fpextend (!cast<ValueType>(HalfTy) Int16Regs:$a))),
            (!cast<Instruction>("CVT_f32_" # HalfTy)
               Int16Regs:$a, CvtNONE_FTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(f32 (fpextend (!cast<ValueType>(HalfTy) Int16Regs:$a))),
            (!cast<Instruction>("CVT_f32_" # HalfTy)
               Int16Regs:$a, CvtNONE)>;
  def : Pat<(f64 (fpextend (!cast<ValueType>(HalfTy) Int16Regs:$a))),
            (!cast<Instruction>("CVT_f64_" # HalfTy)
               Int16Regs:$a, CvtNONE)>;
}

// fpextend f32 -> f64: doF32FTZ-guarded pattern first, unguarded one second.
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
3630
// DAG node for NVPTXISD::RET_GLUE.  It has no operands or results (SDTNone),
// is chained, and accepts an optional incoming glue.  It is the selection
// pattern of the `Return` instruction.
def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
                     [SDNPHasChain, SDNPOptInGlue]>;
3633
3634// fceil, ffloor, froundeven, ftrunc.
3635
// Patterns that lower a unary floating-point rounding node to a same-type cvt
// carrying an explicit conversion mode.
//
//   OpNode  - the DAG node to match.
//   Mode    - conversion mode used for f16, bf16 and f64, and for f32 when
//             doNoF32FTZ holds.
//   ModeFTZ - conversion mode used for f32 when doF32FTZ holds.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // 16-bit floating-point types are held in Int16Regs.
  def : Pat<(OpNode (f16 Int16Regs:$a)),
            (CVT_f16_f16 Int16Regs:$a, Mode)>;
  def : Pat<(OpNode (bf16 Int16Regs:$a)),
            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;

  // f64 has no FTZ variant.
  def : Pat<(OpNode Float64Regs:$a),
            (CVT_f64_f64 Float64Regs:$a, Mode)>;

  // f32: exactly one of the two predicated patterns applies.
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, Mode)>,
        Requires<[doNoF32FTZ]>;
}
3648
// Each rounding node is paired with the conversion mode (and its FTZ
// counterpart) that CVT_ROUND should use for it.
defm : CVT_ROUND<fceil,  CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;

// froundeven rounds to nearest even by definition.  nearbyint and rint are
// implemented the same way.  This isn't strictly correct, because it causes us
// to ignore the rounding mode.  But it matches what CUDA's "libm" does.
foreach RoundOp = [froundeven, fnearbyint, frint] in
  defm : CVT_ROUND<RoundOp, CvtRNI, CvtRNI_FTZ>;
3660
3661//-----------------------------------
3662// Control-flow
3663//-----------------------------------
3664
// Every instruction in this group is a terminator.
let isTerminator = 1 in {
  // Function return; selected from the retglue node.
  let isReturn = 1, isBarrier = 1 in
  def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  let isBranch = 1 in {
    // Conditional branch taken when predicate $a is set.
    def CBranch :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@$a bra \t$target;",
                [(brcond Int1Regs:$a, bb:$target)]>;

    // Conditional branch on the negated predicate ("@!$a").  It has no
    // pattern of its own and is selected through an explicit Pat<>.
    def CBranchOther :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@!$a bra \t$target;",
                []>;
  }

  // Unconditional branch.
  let isBranch = 1, isBarrier = 1 in
  def GOTO :
    NVPTXInst<(outs), (ins brtarget:$target),
              "bra.uni \t$target;",
              [(br bb:$target)]>;
}
3681
// Branch on an i32 condition: build the predicate with a "not equal to 0"
// compare and feed it to CBranch.
def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
          (CBranch
             (SETP_u32ri Int32Regs:$a, 0, CmpNE),
             bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block.  The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1).  Since PTX
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;
3692
// Call
//
// Type profiles for the call-sequence start/end markers: both operands of
// each marker are constrained to i32.
def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

// DAG nodes for ISD::CALLSEQ_START / ISD::CALLSEQ_END.  Both are chained, have
// side effects and produce glue; callseq_end additionally accepts an optional
// incoming glue.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPSideEffect]>;
3703
// NVPTXISD::CALL takes a single i32 operand (the callee) and has no results.
// The node is chained, accepts an optional incoming glue and produces glue.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

// Operand type of the callee.
def calltarget : Operand<i32>;

let isCall = 1 in
def CALL :
  NVPTXInst<(outs), (ins calltarget:$dst),
            "call \t$dst, (1);",
            []>;

// The callee may be a target global address or a target external symbol.
def : Pat<(call tglobaladdr:$dst),
          (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst),
          (CALL texternalsym:$dst)>;
3714
// Pseudo instructions.
//
// Pseudo is a plain alias of NVPTXInst: it forwards all of its arguments and
// adds nothing of its own.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;

// Call-sequence markers.  Callseq_Start prints an opening brace followed by
// a .reg declaration for temp_param_reg, and Callseq_End prints the matching
// closing brace, so together they bracket the call in its own scope in the
// emitted text.
def Callseq_Start :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;

def Callseq_End :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;
3728
// trap instruction
// Selected from the generic `trap` node; takes no operands and defines no
// results.
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;
3733
// Call prototype wrapper
//
// NVPTXISD::CallPrototype has no results and a single integer operand.  The
// node is chained, has side effects, consumes glue and produces glue.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Operand holding the prototype identifier; printed by the instruction
// printer's printProtoIdent method.
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}
// The whole assembly text of this instruction is the printed $ident operand,
// which is matched from a target external symbol.
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3745
3746
3747include "NVPTXIntrinsics.td"
3748
3749
3750//-----------------------------------
3751// Notes
3752//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.
3758