1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
// NOP: no outputs, no inputs, an empty assembly string and no selection
// pattern.  Marked as having no side effects.
let hasSideEffects = false in
def NOP : NVPTXInst<(outs), (ins), "", []>;
19
// Immediate operands for the two 16-bit floating point value types.
let OperandType = "OPERAND_IMMEDIATE" in {
  def f16imm  : Operand<f16>;
  def bf16imm : Operand<bf16>;
}
25
// Vector-specific instruction properties.  Each one is a distinct
// VecInstTypeEnum value.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
35
36//===----------------------------------------------------------------------===//
37// NVPTX Operand Definitions.
38//===----------------------------------------------------------------------===//
39
// Operand of type OtherVT (presumably a branch target, going by the name).
def brtarget : Operand<OtherVT>;
41
// CVT conversion modes, encoded as i32 immediates.
// These must match the enum in NVPTX.h
//
// Base modes occupy the low nibble.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
def CvtRNA : PatLeaf<(i32 0x9)>;

// "_FTZ" variants: the corresponding base value plus 0x10.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating variants: 0x20, and 0x20 + 0x10 when combined with FTZ.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// "_RELU" variants: the corresponding base value plus 0x40.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU : PatLeaf<(i32 0x46)>;

// i32 operand carrying one of the values above; printed by printCvtMode.
let PrintMethod = "printCvtMode" in
def CvtMode : Operand<i32>;
75
// Compare modes, encoded as i32 immediates.
// These must match the enum in NVPTX.h
def CmpEQ : PatLeaf<(i32 0x0)>;
def CmpNE : PatLeaf<(i32 0x1)>;
def CmpLT : PatLeaf<(i32 0x2)>;
def CmpLE : PatLeaf<(i32 0x3)>;
def CmpGT : PatLeaf<(i32 0x4)>;
def CmpGE : PatLeaf<(i32 0x5)>;
def CmpEQU : PatLeaf<(i32 0xA)>;
def CmpNEU : PatLeaf<(i32 0xB)>;
def CmpLTU : PatLeaf<(i32 0xC)>;
def CmpLEU : PatLeaf<(i32 0xD)>;
def CmpGTU : PatLeaf<(i32 0xE)>;
def CmpGEU : PatLeaf<(i32 0xF)>;
def CmpNUM : PatLeaf<(i32 0x10)>;
def CmpNAN : PatLeaf<(i32 0x11)>;

// "_FTZ" variants: the corresponding value above plus 0x100.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying one of the compare modes; printed by printCmpMode.
let PrintMethod = "printCmpMode" in
def CmpMode : Operand<i32>;

// i32 operand printed by printVecElement.
let PrintMethod = "printVecElement" in
def VecElement : Operand<i32>;
114
115//===----------------------------------------------------------------------===//
116// NVPTX Instruction Predicate Definitions
117//===----------------------------------------------------------------------===//
118
119
// Predicates that forward to a query on the subtarget.
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero selection; the two predicates are exact complements.
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// FMA and unsafe-fp-math selection, each with its complement.
def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;

// f32 division level: 0 selects the "approx" forms, 1 the "full" forms.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;

// f32 square root: ".rn" when precise sqrt is requested, approximate otherwise.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate that always holds.
def True : Predicate<"true">;

// Minimum-version predicates: hasPTX<N> / hasSM<N> hold when the subtarget's
// PTX / SM version is at least N.
class hasPTX<int version>
  : Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>
  : Predicate<"Subtarget->getSmVersion() >= " # version>;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL
  : Predicate<"!(Subtarget->getSmVersion() >= 70"
              "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math : Predicate<"Subtarget->hasBF16Math()">;
160
// Helper class to aid conversion between ValueType and a matching
// RegisterClass.
//
//   name - the record name of T, as a string.
//   ret  - the register class selected for T by the first matching !cond arm.
//
// 16-bit scalars (i16, f16, bf16) share Int16Regs and the two-element 16-bit
// float vectors (v2f16, v2bf16) share Int32Regs.  The "a*" names select the
// corresponding *ArgRegs classes.
class ValueToRegClass<ValueType T> {
  string name = !cast<string>(T);
  NVPTXRegClass ret = !cond(
    !eq(name, "i1"): Int1Regs,
    !eq(name, "i16"): Int16Regs,
    !eq(name, "i32"): Int32Regs,
    !eq(name, "i64"): Int64Regs,
    !eq(name, "f16"): Int16Regs,
    !eq(name, "v2f16"): Int32Regs,
    !eq(name, "bf16"): Int16Regs,
    !eq(name, "v2bf16"): Int32Regs,
    !eq(name, "f32"): Float32Regs,
    !eq(name, "f64"): Float64Regs,
    !eq(name, "ai32"): Int32ArgRegs,
    !eq(name, "ai64"): Int64ArgRegs,
    !eq(name, "af32"): Float32ArgRegs,
    // Was spelled "if64", which did not follow the a<type> naming used by the
    // other *ArgRegs arms.
    !eq(name, "af64"): Float64ArgRegs,
  );
}
182
183
184//===----------------------------------------------------------------------===//
185// Some Common Instruction Class Templates
186//===----------------------------------------------------------------------===//
187
// Integer two-source template.  For each width W in {64, 32, 16} it defines a
// register/register form (i<W>rr) and a register/immediate form (i<W>ri),
// both selecting OpNode.  The assembly text is "<OpcStr><W>", so OpcStr is
// expected to end with the type letter (e.g. "add.s" gives "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst,
                    (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst,
                    (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode Int32Regs:$a, imm:$b))]>;

  // 16-bit forms.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode Int16Regs:$a, (imm):$b))]>;
}
216
// Template for the carry-using integer add/sub forms.  Defines rr and ri
// variants printed as "<OpcStr>.s32" (e.g. "addc.cc.s32") and "<OpcStr>.s64".
// The 64-bit variants additionally require PTX version 43 or later.  Every
// instruction here is marked as having side effects.
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  let hasSideEffects = 1 in {
    // 32-bit forms.
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                      (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                      (OpNode Int32Regs:$a, imm:$b))]>;

    // 64-bit forms.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst,
                      (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst,
                      (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}
241
// Floating point two-source template.  Instructions are printed as
// "<OpcStr>.<type>" (e.g. "min.f64") for f64, f32, f16, f16x2, bf16 and
// bf16x2; f64 and f32 also get a register/immediate form.
//
// Every type except f64 also has an ".ftz" variant (flush subnormal inputs
// and results to sign-preserving zero), selected under doF32FTZ.  The f16
// forms require useFP16Math and the bf16 forms require hasBF16Math.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
//
// Each ".ftz" variant is deliberately defined before its plain counterpart;
// keep that order.
multiclass F3<string OpcStr, SDNode OpNode> {
  // f64
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, fpimm:$b))]>;

  // f32
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>;

  // f16 (held in Int16Regs)
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // f16x2 (held in Int32Regs)
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // bf16 (held in Int16Regs)
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math]>;

  // bf16x2 (held in Int32Regs)
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math]>;
}
336
// Floating point two-source template for nodes that can be folded to make fma
// ops.  Instructions are printed as "<OpcStr>.<type>" (e.g. "add.f64") for
// f64, f32, f16, f16x2, bf16 and bf16x2; f64 and f32 also get a
// register/immediate form.
//
// Two parallel sets are defined:
//   * the plain forms, selected when allowFMA holds;
//   * the "_rn" forms, printed with ".rn" and selected when noFMA holds.  The
//     ".rn" variant behaves just like the non ".rn" op, but prevents ptxas
//     from creating FMAs.
//
// Every type except f64 also has an ".ftz" variant (flush subnormal inputs
// and results to sign-preserving zero), selected under doF32FTZ.  The f16
// forms require useFP16Math and the bf16 forms require hasBF16Math.
//
// Each ".ftz" variant is deliberately defined before its plain counterpart;
// keep that order.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //---- FMA contraction allowed ---------------------------------------------

  // f64
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // f32
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // f16 (held in Int16Regs)
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // f16x2 (held in Int32Regs)
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set (v2f16 Int32Regs:$dst),
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // bf16 (held in Int16Regs)
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  // bf16x2 (held in Int32Regs)
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set (v2bf16 Int32Regs:$dst),
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  //---- FMA contraction disabled: ".rn" forms -------------------------------
  // These have strange names so we don't perturb existing mir tests.

  // f64
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // f32
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // f16 (held in Int16Regs)
  def _rnf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // f16x2 (held in Int32Regs)
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // bf16 (held in Int16Regs)
  def _rnbf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;

  // bf16x2 (held in Int32Regs)
  def _rnbf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
}
520
// Single-source floating point template (one destination, one source) for
// f32 and f64.  Provides three instructions: <OpcStr>.f64, <OpcStr>.f32, and
// <OpcStr>.ftz.f32 (flush subnormal inputs and results to zero), the last one
// selected under doF32FTZ.  The ".ftz" form is defined before the plain f32
// form; keep that order.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
536
537//===----------------------------------------------------------------------===//
538// NVPTX Instructions.
539//===----------------------------------------------------------------------===//
540
541//-----------------------------------
542// Type Conversion
543//-----------------------------------
544
545let hasSideEffects = false in {
546  // Generate a cvt to the given type from all possible types.  Each instance
547  // takes a CvtMode immediate that defines the conversion mode to use.  It can
548  // be CvtNONE to omit a conversion mode.
549  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
550    def _s8 :
551      NVPTXInst<(outs RC:$dst),
552                (ins Int16Regs:$src, CvtMode:$mode),
553                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
554                ToType, ".s8 \t$dst, $src;"), []>,
555      Requires<Preds>;
556    def _u8 :
557      NVPTXInst<(outs RC:$dst),
558                (ins Int16Regs:$src, CvtMode:$mode),
559                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
560                ToType, ".u8 \t$dst, $src;"), []>,
561      Requires<Preds>;
562    def _s16 :
563      NVPTXInst<(outs RC:$dst),
564                (ins Int16Regs:$src, CvtMode:$mode),
565                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
566                ToType, ".s16 \t$dst, $src;"), []>,
567      Requires<Preds>;
568    def _u16 :
569      NVPTXInst<(outs RC:$dst),
570                (ins Int16Regs:$src, CvtMode:$mode),
571                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
572                ToType, ".u16 \t$dst, $src;"), []>,
573      Requires<Preds>;
574    def _s32 :
575      NVPTXInst<(outs RC:$dst),
576                (ins Int32Regs:$src, CvtMode:$mode),
577                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
578                ToType, ".s32 \t$dst, $src;"), []>,
579      Requires<Preds>;
580    def _u32 :
581      NVPTXInst<(outs RC:$dst),
582                (ins Int32Regs:$src, CvtMode:$mode),
583                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
584                ToType, ".u32 \t$dst, $src;"), []>,
585      Requires<Preds>;
586    def _s64 :
587      NVPTXInst<(outs RC:$dst),
588                (ins Int64Regs:$src, CvtMode:$mode),
589                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
590                ToType, ".s64 \t$dst, $src;"), []>,
591      Requires<Preds>;
592    def _u64 :
593      NVPTXInst<(outs RC:$dst),
594                (ins Int64Regs:$src, CvtMode:$mode),
595                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
596                ToType, ".u64 \t$dst, $src;"), []>,
597      Requires<Preds>;
598    def _f16 :
599      NVPTXInst<(outs RC:$dst),
600                (ins Int16Regs:$src, CvtMode:$mode),
601                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
602                ToType, ".f16 \t$dst, $src;"), []>,
603      Requires<Preds>;
604    def _bf16 :
605      NVPTXInst<(outs RC:$dst),
606                (ins Int16Regs:$src, CvtMode:$mode),
607                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
608                ToType, ".bf16 \t$dst, $src;"), []>,
609      Requires<!if(!eq(ToType, "f32"),
610                   // bf16->f32 was introduced early.
611                   [hasPTX<71>, hasSM<80>],
612                   // bf16->everything else needs sm90/ptx78
613                   [hasPTX<78>, hasSM<90>])>;
614    def _f32 :
615      NVPTXInst<(outs RC:$dst),
616                (ins Float32Regs:$src, CvtMode:$mode),
617                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
618                ToType, ".f32 \t$dst, $src;"), []>,
619      Requires<!if(!eq(ToType, "bf16"),
620                   // f32->bf16 was introduced early.
621                   [hasPTX<70>, hasSM<80>],
622                   Preds)>;
623    def _f64 :
624      NVPTXInst<(outs RC:$dst),
625                (ins Float64Regs:$src, CvtMode:$mode),
626                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
627                ToType, ".f64 \t$dst, $src;"), []>,
628      Requires<Preds>;
629  }
630
631  // Generate cvts from all types to all types.
632  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
633  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
634  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
635  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
636  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
637  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
638  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
639  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
640  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
641  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
642  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
643  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
644
645  // These cvts are different from those above: The source and dest registers
646  // are of the same type.
647  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
648                                    "cvt.s16.s8 \t$dst, $src;", []>;
649  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
650                                    "cvt.s32.s8 \t$dst, $src;", []>;
651  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
652                                    "cvt.s32.s16 \t$dst, $src;", []>;
653  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
654                                    "cvt.s64.s8 \t$dst, $src;", []>;
655  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
656                                    "cvt.s64.s16 \t$dst, $src;", []>;
657  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
658                                    "cvt.s64.s32 \t$dst, $src;", []>;
659
660  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
661    def _f32 :
662      NVPTXInst<(outs RC:$dst),
663                (ins Float32Regs:$src1, Float32Regs:$src2,  CvtMode:$mode),
664                !strconcat("cvt${mode:base}${mode:relu}.",
665                FromName, ".f32 \t$dst, $src1, $src2;"), []>,
666    Requires<[hasPTX<70>, hasSM<80>]>;
667  }
668
669  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
670  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
671}
672
673//-----------------------------------
674// Selection instructions (selp)
675//-----------------------------------
676
677// TODO: Missing slct
678
679// selp instructions that don't have any pattern matches; we explicitly use
680// them within this file.
let hasSideEffects = false in {
  // selp without selection patterns.  Produces the four register/immediate
  // operand combinations (rr, ri, ir, ii) of "selp.<TypeStr> $dst, $a, $b, $p".
  //   TypeStr - PTX type suffix placed after "selp.".
  //   RC      - register class of $dst and of the register operands.
  //   ImmCls  - operand class used for the immediate operands.
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                []>;
    def ri :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                []>;
    def ir :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                []>;
    def ii :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                []>;
  }

  // Same four selp forms as SELP, but each one also matches the generic
  // `select` node on value type T.
  //   T       - value type the select is matched on.
  //   ImmNode - DAG node used to match the immediate operands.
  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
                          Operand ImmCls, SDNode ImmNode> {
    def rr :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
    def ri :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
    def ir :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
    def ii :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}
721
// Only the .b{16,32,64} integer flavours carry `select` patterns; the
// .s/.u flavours would add nothing, so they are instantiated pattern-less.
defm SELP_b16  : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16  : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16  : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32  : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32  : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32  : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64  : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64  : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64  : SELP<"u64", Int64Regs, i64imm>;
// f16 and bf16 values are held in Int16Regs, hence selp.b16.
defm SELP_f16  : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32  : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64  : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
738
739// This does not work as tablegen fails to infer the type of 'imm'.
740// def v2f16imm : Operand<v2f16>;
741// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
742
// Register-register select of a v2f16 value held in a 32-bit register; this
// is the hand-written stand-in for the SELP_PATTERN instantiation that
// tablegen cannot type-check.
def SELP_f16x2rr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
            "selp.b32 \t$dst, $a, $b, $p;",
            [(set Int32Regs:$dst,
                  (select Int1Regs:$p,
                          (v2f16 Int32Regs:$a),
                          (v2f16 Int32Regs:$b)))]>;
749
750//-----------------------------------
751// Test Instructions
752//-----------------------------------
753
// testp.infinite: sets predicate $p from FP operand $a.  The `r` forms take a
// register operand and the `i` forms an FP immediate operand.  No selection
// patterns are attached; the instructions are referenced explicitly by
// patterns in this file.
def TESTINF_f32r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f32i :
  NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f64r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
            "testp.infinite.f64 \t$p, $a;", []>;
def TESTINF_f64i :
  NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
            "testp.infinite.f64 \t$p, $a;", []>;
766
767//-----------------------------------
768// Integer Arithmetic
769//-----------------------------------
770
// i1 "arithmetic" implemented with xor.pred.  OpNode is the DAG node being
// matched; both the register-register and register-immediate forms emit the
// same xor.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// On a single bit, addition and subtraction both reduce to xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
784
// Signed add/sub for int16, int32 and int64.  nvptx is 2's complement, so the
// same instructions also serve unsigned arithmetic.
defm ADD    : I3<"add.s", add>;
defm SUB    : I3<"sub.s", sub>;

// int32 and int64 addition and subtraction producing a carry-out.
defm ADDCC  : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC  : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction consuming a carry-in and
// producing a carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// Low half of the product.
defm MULT   : I3<"mul.lo.s", mul>;

// High half of the signed / unsigned product.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV   : I3<"div.s", sdiv>;
defm UDIV   : I3<"div.u", udiv>;

// DAGCombiner::visitSREM lowers the immediate cases first, so the ri variants
// of rem.s and rem.u are never actually selected.
defm SREM   : I3<"rem.s", srem>;
defm UREM   : I3<"rem.u", urem>;
810
// Integer absolute value, selected directly onto the PTX "abs" instruction.
// SizeName is the PTX type suffix appended to "abs" (e.g. ".s16"); it should
// correspond to the bit width of RC.
multiclass ABS<RegisterClass RC, string SizeName> {
  // Anonymous def: matches the generic `abs` node on RC and prints
  // "abs<SizeName> $dst, $a".
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;
822
// Signed and unsigned integer max / min.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;
828
829//
830// Wide multiplication
831//
// mul.wide.s32: Int32Regs x {register, i32 immediate, i64 immediate} into an
// Int64Regs destination.  No patterns here; they are attached separately.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;",
            []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;",
            []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;",
            []>;

// mul.wide.u32: unsigned counterpart of the three forms above.
def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;",
            []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;",
            []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;",
            []>;

// mul.wide.s16: Int16Regs x {register, i16 immediate, i32 immediate} into an
// Int32Regs destination.
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;",
            []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;",
            []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;",
            []>;

// mul.wide.u16: unsigned counterpart of the three forms above.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;",
            []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;",
            []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;",
            []>;
871
// Type profile of the target mul.wide nodes: one result, two operands, and
// the two operands must have the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.  Every matcher is gated
// on doMulWide.  Result operands are uniformly spelled with their register
// class so that all eight patterns read the same way.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;

def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
902
// Immediate predicates and transforms used when rewriting patterns to
// mul.wide.

// Immediate representable as a signed 32-bit value.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate representable as an unsigned 32-bit value.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate representable as a signed 16-bit value.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate representable as an unsigned 16-bit value.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

def IntConst_0_30 : PatLeaf<(imm), [{
  // Accept 0 <= amount < 31; only then will the result of (x << amount) be
  // an int32.
  const APInt &Amount = N->getAPIntValue();
  return Amount.sge(0) && Amount.slt(31);
}]>;

def IntConst_0_14 : PatLeaf<(imm), [{
  // Accept 0 <= amount < 15; only then will the result of (x << amount) be
  // an int16.
  const APInt &Amount = N->getAPIntValue();
  return Amount.sge(0) && Amount.slt(15);
}]>;

// Rewrites a shift amount into the equivalent multiplier (1 << amount) as an
// i32 target constant.
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i32);
}]>;

// Rewrites a shift amount into the equivalent multiplier (1 << amount) as an
// i16 target constant.
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i16);
}]>;
947
// Everything in this group is only enabled under doMulWide.
let Predicates = [doMulWide] in {
  // "sign/zero-extend, then shift left by an immediate" becomes mul.wide by
  // the constant (1 << amount).
  def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;

  // "sign/zero-extend, then multiply" becomes mul.wide.  The immediate forms
  // require the constant to fit the narrow operand type.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
991
992//
993// Integer multiply-add
994//
// Type profile of the integer multiply-add node: one integer result and three
// operands, all constrained to the result type.
def SDTIMAD :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// mad.lo.s16.  The suffix letters give the kind of $a, $b, $c in order:
// r = register, i = immediate.
def MAD16rrr :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, imm:$c))]>;

// mad.lo.s32, same four operand combinations.
def MAD32rrr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set Int32Regs:$dst,
                  (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
def MAD32rri :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set Int32Regs:$dst,
                  (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
def MAD32rir :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set Int32Regs:$dst,
                  (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
def MAD32rii :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set Int32Regs:$dst,
                  (imad Int32Regs:$a, imm:$b, imm:$c))]>;

// mad.lo.s64, same four operand combinations.
def MAD64rrr :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, imm:$c))]>;
1062
// Integer negation (`ineg`) for 16-, 32- and 64-bit registers.
def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                       "neg.s16 \t$dst, $src;",
                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                       "neg.s32 \t$dst, $src;",
                       [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                       "neg.s64 \t$dst, $src;",
                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1075
1076//-----------------------------------
1077// Floating Point Arithmetic
1078//-----------------------------------
1079
// FP immediate that is IEEE single precision and equal to 1.0f.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  // Check the semantics first: convertToFloat() is only evaluated for
  // single-precision values.
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;
// FP immediate that is IEEE double precision and equal to 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  // Check the semantics first: convertToDouble() is only evaluated for
  // double-precision values.
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;
1090
// Materialise an FP16 / BF16 constant in a 16-bit register.
//
// ptxas has no hex notation for fp16 immediates, which rules out immediate
// operands on .f16 instructions; the constant is moved into a register with
// mov.b16 instead.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;",
            []>;
def LOAD_CONST_BF16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
            "mov.b16 \t$dst, $a;",
            []>;

defm FADD    : F3_fma_component<"add", fadd>;
defm FSUB    : F3_fma_component<"sub", fsub>;
defm FMUL    : F3_fma_component<"mul", fmul>;

defm FMIN    : F3<"min", fminnum>;
defm FMAX    : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS    : F2<"abs", fabs>;
defm FNEG    : F2<"neg", fneg>;
defm FSQRT   : F2<"sqrt.rn", fsqrt>;
1115
1116//
1117// F16 NEG
1118//
// fneg on f16 / v2f16 values.
//   OpcStr - full PTX mnemonic.
//   T      - value type the fneg is matched on.
//   RC     - register class holding the value.
//   Pred   - extra predicate, added to useFP16Math, PTX >= 6.0 and sm_53.
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC,
                     Predicate Pred>
    : NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]> {
  let Predicates = [useFP16Math, hasPTX<60>, hasSM<53>, Pred];
}
// The ftz variants are defined ahead of their non-ftz counterparts.
def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
1128
1129//
1130// BF16 NEG
1131//
1132
// fneg on bf16 / v2bf16 values.
//   OpcStr - full PTX mnemonic.
//   T      - value type the fneg is matched on.
//   RC     - register class holding the value.
//   Pred   - extra predicate, added to hasBF16Math, PTX >= 7.0 and sm_80.
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC,
                      Predicate Pred>
    : NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]> {
  let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>, Pred];
}
// The ftz variants are defined ahead of their non-ftz counterparts.
def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1142
1143//
1144// F64 division
1145//
// 1.0 / b is emitted as rcp.rn.f64.  The matched constant $a is an operand of
// the instruction but is not printed.
def FDIV641r :
  NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
            "rcp.rn.f64 \t$dst, $b;",
            [(set Float64Regs:$dst,
                  (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
// General f64 division with a register or an immediate divisor.
def FDIV64rr :
  NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst,
                  (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri :
  NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst,
                  (fdiv Float64Regs:$a, fpimm:$b))]>;
1161
1162//
1163// F32 Approximate reciprocal
1164//
// Definition order is kept exactly as before: within each group the ftz
// variants precede the non-ftz ones, and the approx / full groups precede the
// unconditional accurate forms.
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_APPROX] in
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Approximate division
//
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in {
  def FDIV32approxrr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_APPROX] in {
  def FDIV32approxrr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_FULL] in
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Semi-accurate division
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in {
  def FDIV32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_FULL] in {
  def FDIV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Accurate reciprocal
//
let Predicates = [doF32FTZ] in
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
let Predicates = [doF32FTZ] in {
  def FDIV32rr_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
}
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst,
                  (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst,
                  (fdiv Float32Regs:$a, fpimm:$b))]>;
1287
1288//
1289// FMA
1290//
1291
// fma on RC with the rrr / rri / rir / rii operand combinations
// (r = register, i = FP immediate of class ImmCls).  Every variant is gated
// on Pred.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
  let Predicates = [Pred] in {
    def rrr :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                OpcStr # " \t$dst, $a, $b, $c;",
                [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
    def rri :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
                OpcStr # " \t$dst, $a, $b, $c;",
                [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>;
    def rir :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
                OpcStr # " \t$dst, $a, $b, $c;",
                [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>;
    def rii :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
                OpcStr # " \t$dst, $a, $b, $c;",
                [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>;
  }
}

// Register-only fma on value type T, gated on useFP16Math and Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  let Predicates = [useFP16Math, Pred] in
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>;
}

// Register-only fma on value type T, gated on hasBF16Math and Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  let Predicates = [hasBF16Math, Pred] in
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>;
}

// Where an ftz variant exists it is instantiated ahead of the non-ftz one.
defm FMA16_ftz    : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16        : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz  : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2      : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16_ftz   : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16       : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2     : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz    : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32        : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64        : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
1339
// Approximate f32 sin/cos; only selected when unsafe FP math is allowed.
let Predicates = [allowUnsafeFPMath] in {
  def SINF :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
              "sin.approx.f32 \t$dst, $src;",
              [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
  def COSF :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
              "cos.approx.f32 \t$dst, $src;",
              [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
}
1349
1350// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1351// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1352// semantics of LLVM's frem.
1353
// frem - f32 FTZ.
// With unsafe FP math: x - trunc(x / y) * y, with no check for infinite y.
let Predicates = [doF32FTZ, allowUnsafeFPMath] in {
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (FSUBf32rr_ftz Float32Regs:$x,
              (FMULf32rr_ftz
                (CVT_f32_f32
                  (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                  CvtRZI_FTZ),
                Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (FSUBf32rr_ftz Float32Regs:$x,
              (FMULf32ri_ftz
                (CVT_f32_f32
                  (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                  CvtRZI_FTZ),
                fpimm:$y))>;
}

// Without unsafe FP math: selp picks x when testp.infinite reports y as
// infinite, and the computed remainder otherwise.
let Predicates = [doF32FTZ, noUnsafeFPMath] in {
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (SELP_f32rr Float32Regs:$x,
              (FSUBf32rr_ftz Float32Regs:$x,
                (FMULf32rr_ftz
                  (CVT_f32_f32
                    (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                    CvtRZI_FTZ),
                  Float32Regs:$y)),
              (TESTINF_f32r Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (SELP_f32rr Float32Regs:$x,
              (FSUBf32rr_ftz Float32Regs:$x,
                (FMULf32ri_ftz
                  (CVT_f32_f32
                    (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                    CvtRZI_FTZ),
                  fpimm:$y)),
              (TESTINF_f32i fpimm:$y))>;
}

// frem - f32 (non-FTZ): same two shapes as above, built from the non-ftz
// instructions.
let Predicates = [allowUnsafeFPMath] in {
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (FSUBf32rr Float32Regs:$x,
              (FMULf32rr
                (CVT_f32_f32
                  (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                  CvtRZI),
                Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (FSUBf32rr Float32Regs:$x,
              (FMULf32ri
                (CVT_f32_f32
                  (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                  CvtRZI),
                fpimm:$y))>;
}

let Predicates = [noUnsafeFPMath] in {
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (SELP_f32rr Float32Regs:$x,
              (FSUBf32rr Float32Regs:$x,
                (FMULf32rr
                  (CVT_f32_f32
                    (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                    CvtRZI),
                  Float32Regs:$y)),
              (TESTINF_f32r Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (SELP_f32rr Float32Regs:$x,
              (FSUBf32rr Float32Regs:$x,
                (FMULf32ri
                  (CVT_f32_f32
                    (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                    CvtRZI),
                  fpimm:$y)),
              (TESTINF_f32i fpimm:$y))>;
}
1407
// frem - f64
// f64 remainder, expanded as x - cvt(x / y) * y using FDIV64, CVT_f64_f64 with
// CvtRZI, FMULf64 and FSUBf64.  The first two patterns (register and immediate
// divisor) apply when unsafe FP math is allowed and do not inspect $y.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
             Float64Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

// Register-divisor form used when unsafe FP math is not allowed: the expansion
// is wrapped in SELP_f64rr with $x as the other value and TESTINF_f64r of $y
// as the predicate.
// NOTE(review): presumably selects $x when $y is infinite -- confirm against
// the SELP definition, which is outside this section.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
               Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
// frem(x, imm) for f64 when unsafe FP math is not allowed.  The remainder
// expansion x - cvt(x / y) * y is wrapped in SELP_f64rr with $x as the other
// value and an infinity test of the divisor as the predicate.  The divisor is
// matched as fpimm here, so the test must use the immediate form TESTINF_f64i
// (as the f32 immediate pattern does with TESTINF_f32i); the register form
// TESTINF_f64r would receive a constant in a register operand slot.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;
1434
1435//-----------------------------------
1436// Bitwise operations
1437//-----------------------------------
1438
// Two-input bitwise instruction family.  For each operand width -- predicate
// (i1), 16, 32 and 64 bits -- this emits a register/register ("rr") and a
// register/immediate ("ri") record.  The asm string is OpcStr followed by the
// type suffix (.pred, .b16, .b32 or .b64) and the operands "$dst, $a, $b";
// the selection pattern is OpNode applied to the two inputs.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // Predicate (i1) forms.
  def b1rr : NVPTXInst<(outs Int1Regs:$dst),
                       (ins Int1Regs:$a, Int1Regs:$b),
                       OpcStr # ".pred  \t$dst, $a, $b;",
                       [(set Int1Regs:$dst,
                             (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri : NVPTXInst<(outs Int1Regs:$dst),
                       (ins Int1Regs:$a, i1imm:$b),
                       OpcStr # ".pred  \t$dst, $a, $b;",
                       [(set Int1Regs:$dst,
                             (OpNode Int1Regs:$a, imm:$b))]>;

  // 16-bit forms.
  def b16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # ".b16  \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # ".b16  \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, imm:$b))]>;

  // 32-bit forms.
  def b32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # ".b32  \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def b32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # ".b32  \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;

  // 64-bit forms.
  def b64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # ".b64  \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # ".b64  \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, imm:$b))]>;
}
1475
// Instantiate the BITWISE family for or / and / xor.  Each defm produces the
// b1, b16, b32 and b64 records in rr and ri forms (e.g. ORb32rr, XORb1rr).
defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// Bitwise complement, one record per width (predicate, 16, 32, 64 bits).
// Register operand only; each selects the `not` node on that register class.
def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                      "not.pred \t$dst, $src;",
                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                      "not.b16 \t$dst, $src;",
                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                      "not.b32 \t$dst, $src;",
                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                       "not.b64 \t$dst, $src;",
                       [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1492
// Shift instruction family.  Operands are
//   [dest (reg), src (reg), shift amount (reg or imm)].
// The data operands are 64, 32 or 16 bits wide; the shift amount is always a
// 32-bit value.  OpcStr supplies the mnemonic up to and including the type
// letter (e.g. "shl.b"); the width digits are appended here.
//
// In addition to the reg/reg and reg/imm forms, a 32-bit (imm, imm) form is
// defined.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit data, 32-bit shift amount.
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int32Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int32Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i32imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

  // 32-bit data, including the immediate/immediate form.
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
  def i32ii : NVPTXInst<(outs Int32Regs:$dst),
                        (ins i32imm:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

  // 16-bit data, 32-bit shift amount.
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int32Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int32Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i32imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
1528
// Instantiate the SHIFT family: shl -> "shl.b<N>", sra -> "shr.s<N>" and
// srl -> "shr.u<N>", where <N> is the width appended by the multiclass.
defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
// Selects the `bitreverse` node for 32- and 64-bit registers as brev.b32 /
// brev.b64.
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
             "brev.b32 \t$dst, $a;",
             [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
             "brev.b64 \t$dst, $a;",
             [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1542
//
// Rotate: Use ptx shf instruction if available.
//
// The four records below are gated on hasHWROT32.  Each passes $src as both
// data inputs of shf.{l,r}.wrap.b32, with the rotate amount taken either from
// an immediate or from a 32-bit register.

// 32 bit r2 = rotl r1, n
//    =>
//        r2 = shf.l r1, r1, n
def ROTL32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
           Requires<[hasHWROT32]>;

// Register-amount variant of the left rotate above.
def ROTL32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
           Requires<[hasHWROT32]>;

// 32 bit r2 = rotr r1, n
//    =>
//        r2 = shf.r r1, r1, n
def ROTR32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
           Requires<[hasHWROT32]>;

// Register-amount variant of the right rotate above.
def ROTR32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
           Requires<[hasHWROT32]>;
1576
// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Emits a scoped PTX block that shifts $src left by $amt1 and right by $amt2
// into temporaries and adds the two halves into $dst.  The record has no
// selection pattern of its own; it is only reached through the Pats below.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Transforms an immediate N into the i32 target constant 32 - N, i.e. the
// complementary shift amount for a 32-bit rotate.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// Without hardware rotate support, rotl uses ($amt, 32 - $amt) as the
// (left, right) shift amounts and rotr swaps them.
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;
1600
// 32-bit software rotate left by register.
// Used when noHWROT32 holds.  The emitted PTX block computes
// %amt2 = 32 - $amt, shifts $src left by $amt and right by %amt2, and adds
// the two partial results into $dst.
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shl.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
           Requires<[noHWROT32]>;

// 32-bit software rotate right by register.
// Mirror image of ROTL32reg_sw: shifts right by $amt and left by 32 - $amt.
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shr.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
           Requires<[noHWROT32]>;
1630
// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Emits a scoped PTX block that shifts $src left by $amt1 and right by $amt2
// into 64-bit temporaries and adds them into $dst.  The record has no
// selection pattern of its own; it is only reached through the Pats below.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Transforms an immediate N into the i32 target constant 64 - N, i.e. the
// complementary shift amount for a 64-bit rotate.
def SUB_FRM_64 : SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// 64-bit rotates by immediate carry no predicate here: rotl uses
// ($amt, 64 - $amt) as the (left, right) shift amounts and rotr swaps them.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1652
// 64-bit software rotate left by register.
// The rotate amount is a 32-bit register.  The emitted PTX block computes
// %amt2 = 64 - $amt, shifts $src left by $amt and right by %amt2, and adds
// the two partial results into $dst.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shl.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;

// 64-bit software rotate right by register.
// Mirror image of ROTL64reg_sw: shifts right by $amt and left by 64 - $amt.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shr.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
1679
1680//
// Funnel shift in clamp mode
1682//
1683
// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// Both nodes use the SDTIntShiftDOp type profile and carry no node properties.
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// shf.l.clamp.b32: selects FUN_SHFL_CLAMP on three 32-bit registers
// ($lo, $hi, $amt), printed in that operand order.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
              (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;

// shf.r.clamp.b32: right-shifting counterpart, selecting FUN_SHFR_CLAMP.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
             (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1702
1703//
1704// BFE - bit-field extract
1705//
1706
// Bit-field extract instruction family.  Operands are
//   [dest (reg), src (reg), $b (reg or imm), $c (reg or imm)].
// $b may be an immediate only when $c is one as well, so three forms are
// provided: rrr, rri and rii.  FIXME: Is this a restriction in PTX?
// NOTE(review): the PTX ISA describes the last two bfe operands as start
// position and field length -- confirm before relying on "start/end" naming.
//
// dest and src share the register class RC (32- or 64-bit); $b and $c are
// always 32-bit.  No selection patterns are attached.
multiclass BFE<string TyStr, RegisterClass RC> {
  def rrr : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
  def rri : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, Int32Regs:$b, i32imm:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
  def rii : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, i32imm:$b, i32imm:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
}

// Signed and unsigned extracts for 32- and 64-bit sources, all marked as
// having no side effects.
let hasSideEffects = false in {
  defm BFE_S32 : BFE<"s32", Int32Regs>;
  defm BFE_U32 : BFE<"u32", Int32Regs>;
  defm BFE_S64 : BFE<"s64", Int64Regs>;
  defm BFE_U64 : BFE<"u64", Int64Regs>;
}
1734
1735//-----------------------------------
1736// Comparison instructions (setp, set)
1737//-----------------------------------
1738
1739// FIXME: This doesn't cover versions of set and setp that combine with a
1740// boolean predicate, e.g. setp.eq.and.b16.
1741
// setp instruction family: compares $a with $b under the comparison mode $cmp
// and writes a predicate register.  Three operand forms are generated per
// type -- reg/reg (rr), reg/imm (ri) and imm/reg (ir).  The mnemonic is built
// from the ${cmp:base} and ${cmp:ftz} operand modifiers followed by TypeStr.
// No selection patterns are attached here.
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
    def ri : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
    def ir : NVPTXInst<(outs Int1Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
  }
}
1758
// Instantiate SETP for the bit-typed, signed and unsigned integer types at
// 16/32/64 bits and for f32/f64.  Each defm yields rr, ri and ir records
// (e.g. SETP_s32rr, SETP_s32ri, SETP_s32ir).
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// Half-precision compare; only a reg/reg form exists, with operands held in
// Int16Regs.  Requires useFP16Math.
def SETP_f16rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                []>, Requires<[useFP16Math]>;

// Packed f16x2 compare: operands are held in Int32Regs and two predicates
// ($p|$q) are produced.  Requires useFP16Math.
def SETP_f16x2rr :
      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                []>,
                Requires<[useFP16Math]>;
// bf16 compare, reg/reg only, operands in Int16Regs.  Requires hasBF16Math.
def SETP_bf16rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
                []>, Requires<[hasBF16Math]>;

// Packed bf16x2 compare producing two predicates ($p|$q).  Requires
// hasBF16Math.
def SETP_bf16x2rr :
      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
                []>,
                Requires<[hasBF16Math]>;
1794
1795
1796// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1797// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1798// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1799
// set instruction family: like SETP, but the result is written to a 32-bit
// integer register instead of a predicate.  Three operand forms are generated
// per type -- reg/reg (rr), reg/imm (ri) and imm/reg (ir).  The mnemonic is
// "set" followed by the $cmp operand and TypeStr.  No selection patterns are
// attached here.
let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
    def ri : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
    def ir : NVPTXInst<(outs Int32Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
  }
}
1813
// Instantiate SET for the integer types at 16/32/64 bits and for the
// floating-point types.  f16 and bf16 operands live in Int16Regs and use the
// f16imm / bf16imm immediate operands.  Each defm yields rr, ri and ir
// records (e.g. SET_s32rr).
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1827
1828//-----------------------------------
1829// Data Movement (Load / Store, Move)
1830//-----------------------------------
1831
// Complex patterns for addresses.  ADDRri / ADDRri64 produce two operands via
// SelectADDRri / SelectADDRri64, are rooted at frameindex nodes and request
// the root node (SDNPWantRoot).  ADDRvar produces a single operand via
// SelectDirectAddr.
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
                            [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
                              [SDNPWantRoot]>;
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// Register + immediate memory operands (32- and 64-bit), printed with
// printMemOperand.
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

// Pointer-typed operand printed with the generic printOperand.
def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

// Same as imem but typed iPTRAny.
def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operand printed by printLdStCode.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

// i32 operand printed by printMmaCode.
def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}
1862
// NVPTXISD::Wrapper: one result and one operand of the same pointer type.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
// Both select a Wrapper around a target global address.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
                         "mov.u32 \t$dst, $a;",
                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                           "mov.u64 \t$dst, $a;",
                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
// $num is appended to the "__local_depot" symbol name in the asm string.
// These records have no selection pattern.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
                                     "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                    "mov.u64 \t$d, __local_depot$num;", []>;
}
1881
1882
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves.  All are flagged IsSimpleMove with no side
// effects and carry no selection patterns.
let IsSimpleMove=1, hasSideEffects=0 in {
  // Integer moves using the .pred / .u16 / .u32 / .u64 type suffixes.
  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                           "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;

  // Same register classes, printed with the untyped .b16 / .b32 / .b64
  // suffixes.
  def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.b64 \t$dst, $sss;", []>;

  // Floating-point moves.  The 16-bit form operates on Int16Regs.
  def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                           // We have to use .b16 here as there's no mov.f16.
                           "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;
}
1909
// Immediate-to-register integer moves; each selects a plain `imm` of the
// matching width.
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                        "mov.pred \t$dst, $src;",
                        [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                        "mov.u64 \t$dst, $src;",
                        [(set Int64Regs:$dst, imm:$src)]>;

// Untyped (.b16 / .b32 / .b64) immediate moves.  These have no selection
// pattern.
def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                        "mov.b64 \t$dst, $src;", []>;

// Floating-point immediate moves; each selects an `fpimm`.
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;

// A Wrapper around a target external symbol is materialised with the integer
// immediate move of the pointer's width.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
1939
//---- Copy Frame Index ----
// Materialise an ADDRri / ADDRri64 address into a register with add.u32 /
// add.u64.  The memory operand is printed through the "add" modifier
// (${addr:add}), which supplies the source operands of the add.
def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                            "add.u32 \t$dst, ${addr:add};",
                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1947
1948//-----------------------------------
1949// Comparison and Selection
1950//-----------------------------------
1951
// Selection patterns for one integer comparison.
//
//   OpNode     - the comparison fragment to match (e.g. setgt).
//   Mode       - comparison-mode leaf passed as the last operand of every
//                selected instruction.
//   setp_*     - instructions used when the result type is i1 (predicate).
//   set_*      - instructions used when the result type is i32.
//
// For each result type, patterns are emitted for 16-, 32- and 64-bit operands
// in reg/reg (rr), reg/imm (ri) and imm/reg (ir) forms.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}
2015
// ISET_FORMAT bound to the signed (.s16/.s32/.s64) SETP and SET instructions,
// so callers only supply the comparison node and mode.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  // This anonymous leaf is a placeholder so the body is non-empty.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (.u16/.u32/.u64) SETP and SET
// instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  // This anonymous leaf is a placeholder so the body is non-empty.
  def : PatLeaf<(i32 0)>;
}
2039
// Integer comparison patterns.  The set{gt,lt,ge,le,eq,ne} fragments select
// the signed instructions; the setu{gt,lt,ge,le,eq,ne} fragments select the
// unsigned instructions.  Both groups use the same CmpGT/LT/GE/LE/EQ/NE modes.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
2052
// i1 compares
// Inequality of two predicates is their XOR; equality is the negated XOR.
// The setune / setueq fragments are mapped to the same instructions.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2063
// i1 compare -> i32
// The predicate XOR of the inputs is true exactly when they differ.  SELP
// then widens it to an i32 mask: setne yields -1 when the XOR is set and 0
// otherwise; seteq uses the swapped constants, yielding 0 when the inputs
// differ and -1 when they match.  (The seteq pattern previously matched setne
// as well, which made it an unreachable duplicate of the first pattern.)
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2069
2070
2071
// Selection patterns for one floating-point comparison.
//
//   OpNode  - the comparison fragment to match (setogt, setult, seto, ...).
//   Mode    - comparison-mode leaf passed to SETP/SET by default.
//   ModeFTZ - comparison-mode leaf used instead when doF32FTZ holds.
//
// For every element type (f16, bf16, f32, f64) patterns are provided for a
// predicate (i1) result via SETP_* and for an i32 result via SET_*, in
// reg/reg, reg/imm and imm/reg operand forms.  The ModeFTZ pattern of each
// pair is listed before its plain counterpart.  f64 has no ModeFTZ variants.
//
// f16 and bf16 immediates are first materialized into a 16-bit register with
// LOAD_CONST_F16 / LOAD_CONST_BF16, so all of their patterns select the
// register/register instruction form.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The left-hand constant lives in a register after LOAD_CONST_F16, so the
  // rr form is selected (not ir), matching the f16 -> pred patterns above.
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // As for f16: the constant is in a register, so use the rr form.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
2203
// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT,  CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT,  CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE,  CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE,  CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ,  CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE,  CmpNE_FTZ>;

// Unordered comparisons; these use the "U"-suffixed comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons without an explicit ordered/unordered qualifier are given the
// same comparison modes as the ordered group above.
defm FSetGT  : FSET_FORMAT<setgt,  CmpGT,  CmpGT_FTZ>;
defm FSetLT  : FSET_FORMAT<setlt,  CmpLT,  CmpLT_FTZ>;
defm FSetGE  : FSET_FORMAT<setge,  CmpGE,  CmpGE_FTZ>;
defm FSetLE  : FSET_FORMAT<setle,  CmpLE,  CmpLE_FTZ>;
defm FSetEQ  : FSET_FORMAT<seteq,  CmpEQ,  CmpEQ_FTZ>;
defm FSetNE  : FSET_FORMAT<setne,  CmpNE,  CmpNE_FTZ>;

// seto / setuo tests, mapped to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto,   CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo,  CmpNAN, CmpNAN_FTZ>;
2227
2228// FIXME: What is this doing here?  Can it be deleted?
2229// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
2230//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
2231
// Type profiles for the NVPTX-specific SelectionDAG nodes declared below.
// Each one is SDTypeProfile<number of results, number of operands,
// [type constraints]>.

// Parameter declarations.
def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

// Parameter loads producing one, two or four results.
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2,
                    [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

// Call printing.
def SDTPrintCallProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Parameter stores.
def SDTStoreParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
    : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
    : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

// Call argument list and call target.
def SDTCallArgProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile
    : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile
    : SDTypeProfile<0, 1, []>;
def SDTCallValProfile
    : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile
    : SDTypeProfile<1, 1, []>;

// Return-value stores.
def SDTStoreRetvalProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile
    : SDTypeProfile<0, 5, [SDTCisInt<0>]>;

// Miscellaneous single-operand helpers.
def SDTPseudoUseParamProfile
    : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile
    : SDTypeProfile<1, 1, []>;
2255
// SelectionDAG node definitions for the NVPTXISD opcodes used by call,
// parameter and return-value lowering.  Each def binds an NVPTXISD opcode
// name to one of the type profiles above and lists its node properties.

// Parameter / return declarations.
def DeclareParam
    : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam
    : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam
    : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet
    : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter loads.
def LoadParam
    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2
    : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4
    : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;

// Call printing (regular / convergent, plain / .uni).
def PrintCall
    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall
    : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni
    : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni
    : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter stores.
def StoreParam
    : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2
    : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4
    : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32
    : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32
    : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Call argument list, call target and prototype.
def CallArgBegin
    : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg
    : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg
    : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd
    : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid
    : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype
    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal
    : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// MoveParam carries no node properties.
def MoveParam
    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores: chained and side-effecting, but not glued.
def StoreRetval
    : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2
    : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4
    : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
             [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam
    : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode
    : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg
    : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2345
// Instruction classes that read a call's return value out of the "retval0"
// parameter at byte offset $b.  `opstr` is the PTX type suffix (e.g. ".b32").
// None of them carries a selection pattern.
let mayLoad = true in {
  // Scalar form: ld.param<opstr>.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst),
                  (ins i32imm:$b),
                  "ld.param" # opstr # " \t$dst, [retval0+$b];",
                  []>;

  // Two-element vector form: ld.param.v2<opstr>.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                  (ins i32imm:$b),
                  "ld.param.v2" # opstr
                      # " \t{{$dst, $dst2}}, [retval0+$b];",
                  []>;

  // Four-element vector form: ld.param.v4<opstr>.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                        regclass:$dst3, regclass:$dst4),
                  (ins i32imm:$b),
                  "ld.param.v4" # opstr
                      # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                  []>;
}
2365
// Register form of a return-value read: matches (LoadParam 0, imm) and is
// printed as a mov from the register named "retval<$b>".
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                "mov" # opstr # " \t$dst, retval$b;",
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2370
// Instruction classes that write outgoing call parameters ("param<$a>" at
// byte offset $b) and the function's own return value ("func_retval0" at
// byte offset $a).  `opstr` is the PTX type suffix.  None of them carries a
// selection pattern.
let mayStore = true in {
  // st.param to an outgoing parameter: scalar, v2 and v4 forms.
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a, i32imm:$b),
                  "st.param" # opstr # " \t[param$a+$b], $val;",
                  []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                  "st.param.v2" # opstr
                      # " \t[param$a+$b], {{$val, $val2}};",
                  []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a, i32imm:$b),
                  "st.param.v4" # opstr
                      # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                  []>;

  // st.param to func_retval0: scalar, v2 and v4 forms.
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a),
                  "st.param" # opstr # " \t[func_retval0+$a], $val;",
                  []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a),
                  "st.param.v2" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2}};",
                  []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a),
                  "st.param.v4" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                  []>;
}
2411
// Instructions that print the head of a call statement.  OpcStr is the
// mnemonic ("call" or "call.uni"); OpNode is matched with an i32 constant
// giving the number of return values (0 through 8), and the matching
// instruction prints the corresponding "(retval0, ...)" list.
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
      OpcStr # " ",
      [(OpNode (i32 0))]>;
    def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0), ",
      [(OpNode (i32 1))]>;
    def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1), ",
      [(OpNode (i32 2))]>;
    def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2), ",
      [(OpNode (i32 3))]>;
    def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3), ",
      [(OpNode (i32 4))]>;
    def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
      [(OpNode (i32 5))]>;
    def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
             # "retval5), ",
      [(OpNode (i32 6))]>;
    def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
             # "retval5, retval6), ",
      [(OpNode (i32 7))]>;
    def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
             # "retval5, retval6, retval7), ",
      [(OpNode (i32 8))]>;
  }
}
2442
// Regular call instructions.
defm Call    : CALL<"call",     PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  They print the same text as the regular
// calls above but are selected from the convergent nodes and have the
// isConvergent bit set.
let isConvergent = 1 in {
  defm ConvergentCall    : CALL<"call",     PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2452
// Return-value loads, integer types.  The I8 variants load into 16-bit
// registers using the .b8 type suffix.
def LoadParamMemI64   : LoadParamMemInst  <Int64Regs,   ".b64">;
def LoadParamMemI32   : LoadParamMemInst  <Int32Regs,   ".b32">;
def LoadParamMemI16   : LoadParamMemInst  <Int16Regs,   ".b16">;
def LoadParamMemI8    : LoadParamMemInst  <Int16Regs,   ".b8">;
def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs,   ".b64">;
def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs,   ".b32">;
def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs,   ".b16">;
def LoadParamMemV2I8  : LoadParamV2MemInst<Int16Regs,   ".b8">;
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs,   ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs,   ".b16">;
def LoadParamMemV4I8  : LoadParamV4MemInst<Int16Regs,   ".b8">;
// Return-value loads, floating-point types.
def LoadParamMemF32   : LoadParamMemInst  <Float32Regs, ".f32">;
def LoadParamMemF64   : LoadParamMemInst  <Float64Regs, ".f64">;
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
2469
// Outgoing-parameter stores, integer scalars and two-element vectors.  The
// I8 variants take their value from 16-bit registers with the .b8 suffix.
def StoreParamI64   : StoreParamInst  <Int64Regs,   ".b64">;
def StoreParamI32   : StoreParamInst  <Int32Regs,   ".b32">;

def StoreParamI16   : StoreParamInst  <Int16Regs,   ".b16">;
def StoreParamI8    : StoreParamInst  <Int16Regs,   ".b8">;
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs,   ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs,   ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs,   ".b16">;
def StoreParamV2I8  : StoreParamV2Inst<Int16Regs,   ".b8">;

// Outgoing-parameter stores, integer four-element vectors.
def StoreParamV4I32 : StoreParamV4Inst<Int32Regs,   ".b32">;
def StoreParamV4I16 : StoreParamV4Inst<Int16Regs,   ".b16">;
def StoreParamV4I8  : StoreParamV4Inst<Int16Regs,   ".b8">;

// Outgoing-parameter stores, floating-point types.
def StoreParamF32   : StoreParamInst  <Float32Regs, ".f32">;
def StoreParamF64   : StoreParamInst  <Float64Regs, ".f64">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
2489
// Return-value stores, integer types.  The I8 variants take their value
// from 16-bit registers with the .b8 suffix.
def StoreRetvalI64   : StoreRetvalInst  <Int64Regs,   ".b64">;
def StoreRetvalI32   : StoreRetvalInst  <Int32Regs,   ".b32">;
def StoreRetvalI16   : StoreRetvalInst  <Int16Regs,   ".b16">;
def StoreRetvalI8    : StoreRetvalInst  <Int16Regs,   ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs,   ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs,   ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs,   ".b16">;
def StoreRetvalV2I8  : StoreRetvalV2Inst<Int16Regs,   ".b8">;
def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs,   ".b32">;
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs,   ".b16">;
def StoreRetvalV4I8  : StoreRetvalV4Inst<Int16Regs,   ".b8">;

// Return-value stores, floating-point types.
def StoreRetvalF64   : StoreRetvalInst  <Float64Regs, ".f64">;
def StoreRetvalF32   : StoreRetvalInst  <Float32Regs, ".f32">;
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
2507
// Punctuation around a call's argument list, plus the return instruction.
// CallArgEnd's i32 operand picks the closing text: 1 prints ");" and 0
// prints ")" without the semicolon.
def CallArgBeginInst
    : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1
    : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0
    : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst
    : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2512
// Register call arguments.  Every class here matches its node with a first
// operand of (i32 0).  CallArg prints "$a, " (more arguments follow);
// LastCallArg prints "$a" with no separator.  The *VT variants spell the
// operand in the pattern by value type instead of by register class.
class CallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a, ",
                [(CallArg (i32 0), regclass:$a)]>;

class CallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a, ",
                [(CallArg (i32 0), vt:$a)]>;

class LastCallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a",
                [(LastCallArg (i32 0), regclass:$a)]>;

class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a",
                [(LastCallArg (i32 0), vt:$a)]>;
2527
// Non-final register arguments.
def CallArgI64 : CallArgInst  <Int64Regs>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst  <Float64Regs>;
def CallArgF32 : CallArgInst  <Float32Regs>;

// Final register argument of the list.
def LastCallArgI64 : LastCallArgInst  <Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst  <Float64Regs>;
def LastCallArgF32 : LastCallArgInst  <Float32Regs>;
2539
// Immediate i32 arguments: first node operand is 0, printed as the bare
// immediate.
def CallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a),
                "$a, ",
                [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a),
                "$a",
                [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Parameter-symbol arguments: first node operand is 1, and the immediate is
// printed as the index in "param<$a>".
def CallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a),
                "param$a, ",
                [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a),
                "param$a",
                [(LastCallArg (i32 1), (i32 imm:$a))]>;
2549
// Call target operand, printed as "$addr, ".  There is one form for a
// wrapped global address and one each for a 32-bit and a 64-bit register.
def CallVoidInst
    : NVPTXInst<(outs), (ins imem:$addr),
                "$addr, ",
                [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg
    : NVPTXInst<(outs), (ins Int32Regs:$addr),
                "$addr, ",
                [(CallVoid i32:$addr)]>;
def CallVoidInstReg64
    : NVPTXInst<(outs), (ins Int64Regs:$addr),
                "$addr, ",
                [(CallVoid Int64Regs:$addr)]>;

// Prototype reference, printed as ", prototype_<$val>;".
def PrototypeInst
    : NVPTXInst<(outs), (ins i32imm:$val),
                ", prototype_$val;",
                [(Prototype (i32 imm:$val))]>;
2558
// Declarations for a call's return value.  DeclareRetParam declares an
// aligned byte array; DeclareRet declares a scalar, where a first operand of
// 1 selects a ".param" declaration and 2 selects a ".reg" declaration.
def DeclareRetMemInst
    : NVPTXInst<(outs),
                (ins i32imm:$align, i32imm:$size, i32imm:$num),
                ".param .align $align .b8 retval$num[$size];",
                [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                                  (i32 imm:$num))]>;
def DeclareRetScalarInst
    : NVPTXInst<(outs),
                (ins i32imm:$size, i32imm:$num),
                ".param .b$size retval$num;",
                [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst
    : NVPTXInst<(outs),
                (ins i32imm:$size, i32imm:$num),
                ".reg .b$size retval$num;",
                [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations for outgoing call parameters.  DeclareParam declares an
// aligned byte array; DeclareScalarParam declares a scalar, where a last
// operand of 0 selects a ".param" declaration and 1 selects ".reg".
def DeclareParamInst
    : NVPTXInst<(outs),
                (ins i32imm:$align, i32imm:$a, i32imm:$size),
                ".param .align $align .b8 param$a[$size];",
                [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                               (i32 imm:$size))]>;
def DeclareScalarParamInst
    : NVPTXInst<(outs),
                (ins i32imm:$a, i32imm:$size),
                ".param .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst
    : NVPTXInst<(outs),
                (ins i32imm:$a, i32imm:$size),
                ".reg .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2584
// MoveParam of a register: selected to "mov<asmstr> $dst, $src" on values
// of type T held in `regclass`.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

// MoveParam of an external symbol: the source is printed through an operand
// of type `srcty`, and the result has value type `vt`.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins srcty:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set vt:$dst, (MoveParam texternalsym:$src))]>;

def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The i16 form does not go through MoveParamInst: it prints a cvt rather
// than a mov.
def MoveParamI16
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$src),
                "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
                [(set i16:$dst, (MoveParam i16:$src))]>;
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
2608
// Matches a PseudoUseParam node on a value of type `vt`.  The assembly
// string is only a "//" comment line naming the register.
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs),
                (ins regclass:$src),
                "// Pseudo use of $src",
                [(PseudoUseParam vt:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs,   i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs,   i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs,   i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
2619
// ProxyReg: selected to "mov.<SzStr> $dst, $src" on a value of type T held
// in `regclass`.
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov." # SzStr # " \t$dst, $src;",
                [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

def ProxyRegI1  : ProxyRegInst<"pred", i1,  Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16",  i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32",  i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64",  i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32",  f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64",  f64, Float64Regs>;

// Scalar half-precision types reuse the 16-bit integer proxy.
foreach vt = [f16, bf16] in {
  def : Pat<(vt (ProxyReg vt:$src)),
            (ProxyRegI16 Int16Regs:$src)>;
}

// Two-element half-precision vectors reuse the 32-bit integer proxy.
foreach vt = [v2f16, v2bf16] in {
  def : Pat<(vt (ProxyReg vt:$src)),
            (ProxyRegI32 Int32Regs:$src)>;
}
2639
2640//
2641// Load / Store Handling
2642//
// Scalar load instructions producing a value in `regclass`.  One def is
// generated per addressing form:
//   _avar    - symbol                  _areg / _areg_64 - 32/64-bit register
//   _asi     - symbol + immediate      _ari  / _ari_64  - register + immediate
// The $isVol, $addsp, $Vec and $Sign operands are printed through their
// asm-string modifiers, followed by $fromWidth.  None of the defs carries a
// selection pattern.
multiclass LD<NVPTXRegClass regclass> {
  // Mnemonic and modifier text shared by every addressing form.
  defvar LdAsmPrefix =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    LdAsmPrefix # "\t$dst, [$addr];", []>;
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    LdAsmPrefix # "\t$dst, [$addr];", []>;
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    LdAsmPrefix # "\t$dst, [$addr];", []>;
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;
}
2681
// Scalar loads per value type.  i8 and i16 both use 16-bit registers.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}
2690
// Scalar store instructions taking the value from `regclass`.  One def is
// generated per addressing form:
//   _avar    - symbol                  _areg / _areg_64 - 32/64-bit register
//   _asi     - symbol + immediate      _ari  / _ari_64  - register + immediate
// The $isVol, $addsp, $Vec and $Sign operands are printed through their
// asm-string modifiers, followed by $toWidth.  None of the defs carries a
// selection pattern.
multiclass ST<NVPTXRegClass regclass> {
  // Mnemonic and modifier text shared by every addressing form.
  defvar StAsmPrefix =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth";

  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
    StAsmPrefix # " \t[$addr], $src;", []>;
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
    StAsmPrefix # " \t[$addr], $src;", []>;
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
    StAsmPrefix # " \t[$addr], $src;", []>;
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
    StAsmPrefix # " \t[$addr+$offset], $src;", []>;
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
    StAsmPrefix # " \t[$addr+$offset], $src;", []>;
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
    StAsmPrefix # " \t[$addr+$offset], $src;", []>;
}
2729
// Instantiate the ST multiclass once per value type.  Every record created
// here is flagged as a store with no other side effects.  The 8-bit family
// uses Int16Regs, the same register class as the 16-bit family.
let mayStore = true, hasSideEffects = false in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}
2738
// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
//
// Vector load variants producing two (_v2) or four (_v4) results in
// `regclass`.  Inputs are four LdStCode modifier operands (isVol, addsp, Vec,
// Sign), an immediate width, and then the address operands.  The trailing
// part of the record name selects the address form:
//   _avar    - imem address
//   _areg    - Int32Regs address
//   _areg_64 - Int64Regs address
//   _ari     - Int32Regs address plus immediate offset
//   _ari_64  - Int64Regs address plus immediate offset
//   _asi     - imem address plus immediate offset
multiclass LD_VEC<NVPTXRegClass regclass> {
  // Mnemonic plus modifier printers (note the trailing space).
  defvar ldAsm =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";
  // Operands that precede the address in every variant.
  defvar ldMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  // Trailing immediate offset used by the reg+imm and sym+imm forms.
  defvar ldOff = (ins i32imm:$offset);

  // Result lists and their printed form for the two vector widths.
  defvar v2Outs = (outs regclass:$dst1, regclass:$dst2);
  defvar v4Outs = (outs regclass:$dst1, regclass:$dst2,
                        regclass:$dst3, regclass:$dst4);
  defvar v2Dst = "\t{{$dst1, $dst2}}, ";
  defvar v4Dst = "\t{{$dst1, $dst2, $dst3, $dst4}}, ";

  // Two-element forms.
  def _v2_avar    : NVPTXInst<v2Outs, !con(ldMods, (ins imem:$addr)),
                              !strconcat(ldAsm, v2Dst, "[$addr];"), []>;
  def _v2_areg    : NVPTXInst<v2Outs, !con(ldMods, (ins Int32Regs:$addr)),
                              !strconcat(ldAsm, v2Dst, "[$addr];"), []>;
  def _v2_areg_64 : NVPTXInst<v2Outs, !con(ldMods, (ins Int64Regs:$addr)),
                              !strconcat(ldAsm, v2Dst, "[$addr];"), []>;
  def _v2_ari     : NVPTXInst<v2Outs,
                              !con(ldMods, (ins Int32Regs:$addr), ldOff),
                              !strconcat(ldAsm, v2Dst, "[$addr+$offset];"),
                              []>;
  def _v2_ari_64  : NVPTXInst<v2Outs,
                              !con(ldMods, (ins Int64Regs:$addr), ldOff),
                              !strconcat(ldAsm, v2Dst, "[$addr+$offset];"),
                              []>;
  def _v2_asi     : NVPTXInst<v2Outs,
                              !con(ldMods, (ins imem:$addr), ldOff),
                              !strconcat(ldAsm, v2Dst, "[$addr+$offset];"),
                              []>;

  // Four-element forms.
  def _v4_avar    : NVPTXInst<v4Outs, !con(ldMods, (ins imem:$addr)),
                              !strconcat(ldAsm, v4Dst, "[$addr];"), []>;
  def _v4_areg    : NVPTXInst<v4Outs, !con(ldMods, (ins Int32Regs:$addr)),
                              !strconcat(ldAsm, v4Dst, "[$addr];"), []>;
  def _v4_areg_64 : NVPTXInst<v4Outs, !con(ldMods, (ins Int64Regs:$addr)),
                              !strconcat(ldAsm, v4Dst, "[$addr];"), []>;
  def _v4_ari     : NVPTXInst<v4Outs,
                              !con(ldMods, (ins Int32Regs:$addr), ldOff),
                              !strconcat(ldAsm, v4Dst, "[$addr+$offset];"),
                              []>;
  def _v4_ari_64  : NVPTXInst<v4Outs,
                              !con(ldMods, (ins Int64Regs:$addr), ldOff),
                              !strconcat(ldAsm, v4Dst, "[$addr+$offset];"),
                              []>;
  def _v4_asi     : NVPTXInst<v4Outs,
                              !con(ldMods, (ins imem:$addr), ldOff),
                              !strconcat(ldAsm, v4Dst, "[$addr+$offset];"),
                              []>;
}
// Instantiate the LD_VEC multiclass once per element type.  Every record
// created here is flagged as a load with no other side effects.  The 8-bit
// family uses Int16Regs, the same register class as the 16-bit family.
let mayLoad = true, hasSideEffects = false in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}
2824
// Vector store variants taking two (_v2) or four (_v4) source values in
// `regclass`.  After the sources come four LdStCode modifier operands (isVol,
// addsp, Vec, Sign), an immediate width (named $fromWidth here), and then the
// address operands.  The trailing part of the record name selects the address
// form:
//   _avar    - imem address
//   _areg    - Int32Regs address
//   _areg_64 - Int64Regs address
//   _ari     - Int32Regs address plus immediate offset
//   _ari_64  - Int64Regs address plus immediate offset
//   _asi     - imem address plus immediate offset
// No selection patterns are attached to these definitions.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // Mnemonic plus modifier printers (note the trailing space).
  defvar stvAsm =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";
  // Modifier operands that sit between the sources and the address.
  defvar stvMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                        LdStCode:$Sign, i32imm:$fromWidth);
  // Trailing immediate offset used by the reg+imm and sym+imm forms.
  defvar stvOff = (ins i32imm:$offset);

  // Source lists and their printed form for the two vector widths.
  defvar v2Srcs = (ins regclass:$src1, regclass:$src2);
  defvar v4Srcs = (ins regclass:$src1, regclass:$src2,
                       regclass:$src3, regclass:$src4);
  defvar v2Src = "{{$src1, $src2}};";
  defvar v4Src = "{{$src1, $src2, $src3, $src4}};";

  // Two-element forms.
  def _v2_avar    : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins imem:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v2Src), []>;
  def _v2_areg    : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins Int32Regs:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v2Src), []>;
  def _v2_areg_64 : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins Int64Regs:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v2Src), []>;
  def _v2_ari     : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins Int32Regs:$addr),
                                   stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v2Src),
                              []>;
  def _v2_ari_64  : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins Int64Regs:$addr),
                                   stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v2Src),
                              []>;
  def _v2_asi     : NVPTXInst<(outs),
                              !con(v2Srcs, stvMods, (ins imem:$addr), stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v2Src),
                              []>;

  // Four-element forms.
  def _v4_avar    : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins imem:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v4Src), []>;
  def _v4_areg    : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins Int32Regs:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v4Src), []>;
  def _v4_areg_64 : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins Int64Regs:$addr)),
                              !strconcat(stvAsm, "\t[$addr], ", v4Src), []>;
  def _v4_ari     : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins Int32Regs:$addr),
                                   stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v4Src),
                              []>;
  def _v4_ari_64  : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins Int64Regs:$addr),
                                   stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v4Src),
                              []>;
  def _v4_asi     : NVPTXInst<(outs),
                              !con(v4Srcs, stvMods, (ins imem:$addr), stvOff),
                              !strconcat(stvAsm, "\t[$addr+$offset], ", v4Src),
                              []>;
}
2908
// Instantiate the ST_VEC multiclass once per element type.  Every record
// created here is flagged as a store with no other side effects.  The 8-bit
// family uses Int16Regs, the same register class as the 16-bit family.
let mayStore = true, hasSideEffects = false in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}
2917
2918//---- Conversion ----
2919
// Bit-preserving move between two register classes, selected for `bitconvert`
// from value type TIn to value type TOut.
//   SzStr       - bit width appended to the "mov.b" mnemonic.
//   TIn / TOut  - source and result value types.
//   regclassIn / regclassOut - register classes for the operand and result;
//                 by default looked up from the value types through
//                 ValueToRegClass.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret>
  : NVPTXInst<(outs regclassOut:$d),
              (ins regclassIn:$a),
              "mov.b" # SzStr # " \t$d, $a;",
              [(set (TOut regclassOut:$d),
                    (bitconvert (TIn regclassIn:$a)))]>;

// 32-bit: i32 <-> f32.
def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
// 64-bit: i64 <-> f64.
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
2931
// Bitcasts involving the packed 16-bit float vector types, which are matched
// in Int32Regs here.
foreach vt = [v2f16, v2bf16] in {
  // A constant source is materialized directly with an immediate move.
  def : Pat<(vt (bitconvert (i32 UInt32Const:$a))),
            (IMOVB32ri UInt32Const:$a)>;
  // i32 <-> vt goes through ProxyRegI32.
  def : Pat<(vt (bitconvert (i32 Int32Regs:$a))), (ProxyRegI32 $a)>;
  def : Pat<(i32 (bitconvert (vt Int32Regs:$a))), (ProxyRegI32 $a)>;
  // f32 -> vt crosses from Float32Regs, so it uses the 32-bit bitconvert move.
  def : Pat<(vt (bitconvert (f32 Float32Regs:$a))), (BITCONVERT_32_F2I $a)>;
}

// Bitcasts involving the scalar 16-bit float types, which are matched in
// Int16Regs here.
foreach vt = [f16, bf16] in {
  // A constant source is materialized directly with an immediate move.
  def : Pat<(vt (bitconvert (i16 UInt16Const:$a))),
            (IMOVB16ri UInt16Const:$a)>;
  // i16 <-> vt goes through ProxyRegI16.
  def : Pat<(vt (bitconvert (i16 Int16Regs:$a))), (ProxyRegI16 $a)>;
  def : Pat<(i16 (bitconvert (vt Int16Regs:$a))), (ProxyRegI16 $a)>;
}
2950
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns.  Therefore, we
// use an integer selp to materialize the integer value of the predicate and
// then cvt to floating-point.
//
// The materialized integer has to match the signedness of the conversion: a
// set i1 is -1 when read as a signed value and 1 when read as an unsigned
// value.  sint_to_fp therefore selects between -1 and 0 (SELP_s32ii), while
// uint_to_fp selects between 1 and 0 (SELP_u32ii).
//
// All conversions in this section use the CvtRN mode.

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> bf16
def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
          (CVT_bf16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
          (CVT_bf16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
          (CVT_bf16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
          (CVT_bf16_s64 Int64Regs:$a, CvtRN)>;

// uint -> bf16
def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
3034
3035
// f16 -> integer.  The i16/i32/i64 results use a cvt with the CvtRZI mode.
// The i1 result is produced by comparing the 16-bit register against 0 with
// CmpEQ.
// NOTE(review): confirm that an equality test against 0 is the intended
// result for the i1 forms.

// f16 -> sint
def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))), (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))), (CVT_s16_f16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))), (CVT_s32_f16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))), (CVT_s64_f16 $a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))), (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))), (CVT_u16_f16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))), (CVT_u32_f16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))), (CVT_u64_f16 $a, CvtRZI)>;
3055
// bf16 -> integer.  The i16/i32/i64 results use a cvt with the CvtRZI mode.
// The i1 result is produced by comparing the 16-bit register against 0 with
// CmpEQ.
// NOTE(review): confirm that an equality test against 0 is the intended
// result for the i1 forms.

// bf16 -> sint
def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))), (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))), (CVT_s16_bf16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))), (CVT_s32_bf16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))), (CVT_s64_bf16 $a, CvtRZI)>;

// bf16 -> uint
def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))), (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))), (CVT_u16_bf16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))), (CVT_u32_bf16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))), (CVT_u64_bf16 $a, CvtRZI)>;
// f32 -> integer.  Each i16/i32/i64 conversion is listed twice: first the
// variant guarded by doF32FTZ, which uses CvtRZI_FTZ, then the unguarded
// variant using CvtRZI.  The guarded pattern must stay ahead of the unguarded
// one.  The i1 result is produced by comparing the 32-bit image of the value
// against 0 with CmpEQ.
// NOTE(review): confirm that an equality test against 0 is the intended
// result for the i1 forms.

// f32 -> sint
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)), (CVT_s16_f32 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)), (CVT_s32_f32 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)), (CVT_s64_f32 $a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)), (CVT_u16_f32 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)), (CVT_u32_f32 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)), (CVT_u64_f32 $a, CvtRZI)>;
3106
// f64 -> integer.  The i16/i32/i64 results use a cvt with the CvtRZI mode.
// The i1 result is produced by comparing the 64-bit image of the value
// against 0 with CmpEQ.
// NOTE(review): confirm that an equality test against 0 is the intended
// result for the i1 forms.

// f64 -> sint
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)), (CVT_s16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)), (CVT_s32_f64 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)), (CVT_s64_f64 $a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)), (CVT_u16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)), (CVT_u32_f64 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)), (CVT_u64_f64 $a, CvtRZI)>;
3126
// Extensions from i1 are implemented as a select between two immediates,
// keyed on the predicate register.

// sext i1: set predicate -> -1, clear predicate -> 0.
def : Pat<(i16 (sext Int1Regs:$a)), (SELP_s16ii -1, 0, $a)>;
def : Pat<(i32 (sext Int1Regs:$a)), (SELP_s32ii -1, 0, $a)>;
def : Pat<(i64 (sext Int1Regs:$a)), (SELP_s64ii -1, 0, $a)>;

// zext i1: set predicate -> 1, clear predicate -> 0.
def : Pat<(i16 (zext Int1Regs:$a)), (SELP_u16ii 1, 0, $a)>;
def : Pat<(i32 (zext Int1Regs:$a)), (SELP_u32ii 1, 0, $a)>;
def : Pat<(i64 (zext Int1Regs:$a)), (SELP_u64ii 1, 0, $a)>;

// anyext i1: selects between -1 and 0.
def : Pat<(i16 (anyext Int1Regs:$a)), (SELP_u16ii -1, 0, $a)>;
def : Pat<(i32 (anyext Int1Regs:$a)), (SELP_u32ii -1, 0, $a)>;
def : Pat<(i64 (anyext Int1Regs:$a)), (SELP_u64ii -1, 0, $a)>;
3150
// Extensions from i16 and i32 are single cvt instructions with no conversion
// mode (CvtNONE).  sext uses the signed cvt; zext and anyext both use the
// unsigned cvt.

// sext i16
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 $a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 $a, CvtNONE)>;

// zext i16
def : Pat<(i32 (zext Int16Regs:$a)), (CVT_u32_u16 $a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)), (CVT_u64_u16 $a, CvtNONE)>;

// anyext i16
def : Pat<(i32 (anyext Int16Regs:$a)), (CVT_u32_u16 $a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)), (CVT_u64_u16 $a, CvtNONE)>;

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 $a, CvtNONE)>;

// zext i32
def : Pat<(i64 (zext Int32Regs:$a)), (CVT_u64_u32 $a, CvtNONE)>;

// anyext i32
def : Pat<(i64 (anyext Int32Regs:$a)), (CVT_u64_u32 $a, CvtNONE)>;
3180
3181
// Truncation to i16/i32 is an unsigned cvt with no conversion mode.
// Truncation to i1 masks the source with 1 and compares the result against 1
// with CmpEQ.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 $a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 $a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 $a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>;
3199
// sext_inreg: one CVT_INREG_s<container>_s<field> instruction for each
// combination of container register width and sign-extended field width.
def : Pat<(sext_inreg Int16Regs:$a, i8),  (CVT_INREG_s16_s8 $a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8),  (CVT_INREG_s32_s8 $a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 $a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8),  (CVT_INREG_s64_s8 $a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 $a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 $a)>;
3207
3208
// Select instructions with 32-bit predicates.
//
// The condition arrives in an Int32Regs register.  It is turned into a
// predicate by masking it with 1 and comparing the result against 1 with
// CmpEQ; that predicate then drives the register/register selp matching the
// selected value type.
def : Pat<(select Int32Regs:$pred, i16:$a, i16:$b),
          (SELP_b16rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, i32:$a, i32:$b),
          (SELP_b32rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
          (SELP_f16rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
          (SELP_bf16rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr $a, $b,
           (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3231
3232
// Register pack/unpack moves.  None of these carries a selection pattern of
// its own, and all are marked as free of side effects.
let hasSideEffects = false in {
  // Pack a set of smaller registers into one larger register.
  def V4I16toI64 : NVPTXInst<
    (outs Int64Regs:$d),
    (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
    "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 : NVPTXInst<
    (outs Int32Regs:$d),
    (ins Int16Regs:$s1, Int16Regs:$s2),
    "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 : NVPTXInst<
    (outs Int64Regs:$d),
    (ins Int32Regs:$s1, Int32Regs:$s2),
    "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 : NVPTXInst<
    (outs Float64Regs:$d),
    (ins Float32Regs:$s1, Float32Regs:$s2),
    "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // Unpack one larger register into a set of smaller registers.
  def I64toV4I16 : NVPTXInst<
    (outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
    (ins Int64Regs:$s),
    "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 : NVPTXInst<
    (outs Int16Regs:$d1, Int16Regs:$d2),
    (ins Int32Regs:$s),
    "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 : NVPTXInst<
    (outs Int32Regs:$d1, Int32Regs:$d2),
    (ins Int64Regs:$s),
    "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 : NVPTXInst<
    (outs Float32Regs:$d1, Float32Regs:$d2),
    (ins Float64Regs:$s),
    "mov.b64 \t{{$d1, $d2}}, $s;", []>;

  // Extract a single half of a register.  The unwanted half is moved into a
  // scratch register named tmp that is declared inside the emitted block.
  def I32toI16H : NVPTXInst<
    (outs Int16Regs:$high),
    (ins Int32Regs:$s),
    "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", []>;
  def I32toI16L : NVPTXInst<
    (outs Int16Regs:$low),
    (ins Int32Regs:$s),
    "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", []>;
  def I64toI32H : NVPTXInst<
    (outs Int32Regs:$high),
    (ins Int64Regs:$s),
    "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", []>;
}
3277
// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
//
// A truncate of a right shift by half the register width (logical or
// arithmetic) is selected as the matching "high half" unpacking move.
def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), (I32toI16H $s)>;
def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), (I32toI16H $s)>;
def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), (I64toI32H $s)>;
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), (I64toI32H $s)>;
3288
// Element access and construction for the packed 16-bit float vectors, which
// are matched in Int32Regs here: element 0 uses the low-half move, element 1
// the high-half move, and build_vector uses the two-register packing move.

// v2f16
def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 0)), (I32toI16L $src)>;
def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 1)), (I32toI16H $src)>;
def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
          (V2I16toI32 $a, $b)>;

// v2bf16
def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 0)), (I32toI16L $src)>;
def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 1)), (I32toI16H $src)>;
def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
          (V2I16toI32 $a, $b)>;
3302
// Count leading zeros.  Both the 32-bit and the 64-bit form write their
// result to an Int32Regs register.  Selection patterns are attached
// separately.
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int32Regs:$a),
    "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int64Regs:$a),
    "clz.b64 \t$d, $a;", []>;
}
3310
// 32-bit ctlz maps directly onto the PTX instruction.
def : Pat<(ctlz Int32Regs:$a), (CLZr32 $a)>;

// The ctlz ISD node returns the same type as its input, but the PTX clz
// instruction always produces a 32-bit value.  For ctlz.i64 the PTX result is
// therefore widened to 64 bits to match the ISD node's semantics, unless the
// result is known to be truncated straight back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 $a)>;

// For 16-bit ctlz, the input is zero-extended to 32 bits, counted, and the
// result truncated back to 16 bits if necessary.  16 is subtracted because
// the 16 high-order zeros introduced by the extension were counted too.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// it and then count the leading zeros of that value.  This way we would not
// have to subtract 16 from the result.  Unfortunately today we don't have a
// way to generate "mov b32reg, {b16imm, b16reg}", so we don't do this
// optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
            (CVT_u16_u32 (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE),
            16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
3340
// Population count.  Both the 32-bit and the 64-bit form write their result
// to an Int32Regs register.  Selection patterns are attached separately.
let hasSideEffects = false in {
  def POPCr32 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int32Regs:$a),
    "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int64Regs:$a),
    "popc.b64 \t$d, $a;", []>;
}
3348
// 32-bit ctpop maps directly onto the PTX instruction.
def : Pat<(ctpop Int32Regs:$a), (POPCr32 $a)>;

// For 64-bit, the PTX result is actually 32 bits wide, so it is zero-extended
// to 64 bits to match the LLVM semantics.  As with ctlz.i64, a second pattern
// skips the widening when the result is truncated to i32 anyway.
def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 $a)>;

// For 16-bit, the input is zero-extended to 32 bits, counted, and the result
// truncated back to 16 bits.  If the result is known to be zero-extended to
// i32, the final truncate is skipped.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
3365
// fpround: narrowing floating-point conversions, all using the CvtRN mode.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)), (CVT_f16_f32 $a, CvtRN)>;

// fpround f32 -> bf16
def : Pat<(bf16 (fpround Float32Regs:$a)), (CVT_bf16_f32 $a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)), (CVT_f16_f64 $a, CvtRN)>;

// fpround f64 -> bf16
def : Pat<(bf16 (fpround Float64Regs:$a)), (CVT_bf16_f64 $a, CvtRN)>;

// fpround f64 -> f32.  The variant guarded by doF32FTZ uses CvtRN_FTZ and
// must stay ahead of the unguarded variant.
def : Pat<(f32 (fpround Float64Regs:$a)), (CVT_f32_f64 $a, CvtRN_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)), (CVT_f32_f64 $a, CvtRN)>;
3386
// fpextend patterns.  Extensions use the CvtNONE mode.  Conversions that
// involve f32 also have a CvtNONE_FTZ form guarded by doF32FTZ, listed ahead
// of the unguarded form.

// fpextend f16 -> f32
let Predicates = [doF32FTZ] in
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>;
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f32
let Predicates = [doF32FTZ] in
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>;
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f64
def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
let Predicates = [doF32FTZ] in
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
3411
// Selection DAG node for NVPTXISD::RET_GLUE.  It uses the SDTNone type
// profile, is chained, and accepts an optional incoming glue.
def retglue :
  SDNode<"NVPTXISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
3414
// fceil, ffloor, froundeven, ftrunc.

// Emits the selection patterns that lower the unary rounding node OpNode to a
// same-type cvt for f16, bf16, f32 and f64.
//   OpNode  - the SDNode being matched.
//   Mode    - conversion mode used for f16, bf16, f64, and for f32 under
//             doNoF32FTZ.
//   ModeFTZ - conversion mode used for f32 under doF32FTZ.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode (f16 Int16Regs:$a)), (CVT_f16_f16 Int16Regs:$a, Mode)>;
  def : Pat<(OpNode (bf16 Int16Regs:$a)), (CVT_bf16_bf16 Int16Regs:$a, Mode)>;

  // The two f32 patterns are each guarded by a predicate (presumably
  // complementary, going by their names -- confirm where they are defined).
  let Predicates = [doF32FTZ] in
  def : Pat<(OpNode Float32Regs:$a), (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>;
  let Predicates = [doNoF32FTZ] in
  def : Pat<(OpNode Float32Regs:$a), (CVT_f32_f32 Float32Regs:$a, Mode)>;

  def : Pat<(OpNode Float64Regs:$a), (CVT_f64_f64 Float64Regs:$a, Mode)>;
}
3429
// Rounding nodes and the cvt modes (plain and FTZ) they are lowered with.
defm : CVT_ROUND<fceil,      CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor,     CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc,     CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are lowered as round-to-nearest-even.  Strictly speaking
// this is incorrect, since the current rounding mode is ignored, but it
// matches what CUDA's "libm" does.

defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint,      CvtRNI, CvtRNI_FTZ>;
3441
3442//-----------------------------------
3443// Control-flow
3444//-----------------------------------
3445
// Terminator instructions: return, conditional branches, unconditional branch.
let isTerminator = 1 in {
  // Return; selected from the retglue node.
  let isReturn = 1, isBarrier = 1 in
  def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  let isBranch = 1 in {
    // Branch guarded by predicate register $a; selected from brcond on an
    // i1 register.
    def CBranch :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@$a bra \t$target;",
                [(brcond Int1Regs:$a, bb:$target)]>;

    // Branch guarded by the negated predicate ("@!$a").  It has no pattern
    // of its own and is only produced by an explicit Pat.
    def CBranchOther :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@!$a bra \t$target;", []>;
  }

  // Unconditional branch; selected from br.
  let isBranch = 1, isBarrier = 1 in
  def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
                       "bra.uni \t$target;", [(br bb:$target)]>;
}
3462
// brcond on an i32 condition: compare the value against 0 with CmpNE to
// produce the predicate that CBranch consumes.
def : Pat<(brcond Int32Regs:$a, bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE),
                   bb:$target)>;
3465
// SelectionDAGBuilder::visitSwitchCase() inverts the condition of a
// conditional branch when the target block is the next block, so that the
// code can fall through to it.  The inversion is expressed as
// 'xor condition, 1', which is translated to (setne condition, -1).  PTX
// supports '@!pred bra target', so select that form directly.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;
3473
// Call

// Type profiles for the call-sequence markers: both operands are i32.
def SDT_NVPTXCallSeqStart :
  SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd :
  SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

// DAG nodes for ISD::CALLSEQ_START / ISD::CALLSEQ_END.  Both are chained,
// produce glue and have side effects; the end marker also accepts an optional
// incoming glue.
def callseq_start :
  SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end :
  SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;
3484
// NVPTXISD::CALL: no results, one i32 operand (the callee); chained, with
// optional incoming glue and outgoing glue.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

// Operand type for the callee of CALL.
def calltarget : Operand<i32>;

// The call instruction carries no pattern of its own; the Pats below select
// it for the two symbolic callee kinds.
let isCall = 1 in
def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

def : Pat<(call tglobaladdr:$dst),
          (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst),
          (CALL texternalsym:$dst)>;
3495
// Pseudo instructions.
// Thin wrapper over NVPTXInst that forwards all four arguments unchanged.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> :
  NVPTXInst<outs, ins, asmstr, pattern>;
3499
// Start of a call sequence, selected from callseq_start.  The emitted text is
// an opening brace, a comment recording both amounts, and a declaration of
// the 32-bit register temp_param_reg.
def Callseq_Start
    : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                "\\{ // callseq $amt1, $amt2\n"
                "\t.reg .b32 temp_param_reg;",
                [(callseq_start timm:$amt1, timm:$amt2)]>;
// End of a call sequence, selected from callseq_end.  The emitted text is a
// closing brace and a comment; only $amt1 appears in that comment.
def Callseq_End
    : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                "\\} // callseq $amt1",
                [(callseq_end timm:$amt1, timm:$amt2)]>;
3509
// Trap instruction: no operands; selected from the generic trap node.
def trapinst :
  NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
3512
// Call prototype wrapper.
// The node takes a single integer operand and produces no results; it is
// chained, consumes and produces glue, and has side effects.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype
    : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// i32 operand printed by the printProtoIdent method.
def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; }

// The instruction's whole assembly text is the printed $ident operand.
def CALL_PROTOTYPE
    : NVPTXInst<(outs), (ins ProtoIdent:$ident),
                "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3524
3525
3526include "NVPTXIntrinsics.td"
3527
3528
3529//-----------------------------------
3530// Notes
3531//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.
3537