1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
// Immediate operands for the two 16-bit floating-point value types.  Both are
// tagged with the OPERAND_IMMEDIATE operand type.
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
def bf16imm : Operand<bf16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
20
// Vector-specific instruction properties.  Each record wraps one
// VecInstTypeEnum encoding; values 1-7 are assigned consecutively and 15 is
// used for isVecOther.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
30
31//===----------------------------------------------------------------------===//
32// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===//
34
// Operand of value type OtherVT (judging by the name, a branch destination).
def brtarget : Operand<OtherVT>;
36
// CVT conversion modes.
// These must match the enum in NVPTX.h.
//
// Encoding as visible from the values below: the low nibble selects the base
// mode, 0x10 is added for the _FTZ flavour, 0x20 for saturation and 0x40 for
// the _RELU flavour.

// Base modes.
def CvtNONE : PatLeaf<(i32 0x00)>;
def CvtRNI : PatLeaf<(i32 0x01)>;
def CvtRZI : PatLeaf<(i32 0x02)>;
def CvtRMI : PatLeaf<(i32 0x03)>;
def CvtRPI : PatLeaf<(i32 0x04)>;
def CvtRN : PatLeaf<(i32 0x05)>;
def CvtRZ : PatLeaf<(i32 0x06)>;
def CvtRM : PatLeaf<(i32 0x07)>;
def CvtRP : PatLeaf<(i32 0x08)>;
def CvtRNA : PatLeaf<(i32 0x09)>;

// Base modes with the FTZ flag (0x10) added.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating modes (0x20), without and with FTZ.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// RELU modes (0x40 added to NONE / RN / RZ).
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU : PatLeaf<(i32 0x46)>;

// i32 operand carrying one of the encodings above; rendered by printCvtMode.
let PrintMethod = "printCvtMode" in
def CvtMode : Operand<i32>;
70
// Compare modes.
// These must match the enum in NVPTX.h.
//
// The base encodings occupy 0x000-0x011.  Each _FTZ leaf below is the
// corresponding base encoding plus 0x100; LO/LS/HI/HS have no _FTZ leaf here.

// Base modes.
def CmpEQ : PatLeaf<(i32 0x000)>;
def CmpNE : PatLeaf<(i32 0x001)>;
def CmpLT : PatLeaf<(i32 0x002)>;
def CmpLE : PatLeaf<(i32 0x003)>;
def CmpGT : PatLeaf<(i32 0x004)>;
def CmpGE : PatLeaf<(i32 0x005)>;
def CmpLO : PatLeaf<(i32 0x006)>;
def CmpLS : PatLeaf<(i32 0x007)>;
def CmpHI : PatLeaf<(i32 0x008)>;
def CmpHS : PatLeaf<(i32 0x009)>;
def CmpEQU : PatLeaf<(i32 0x00A)>;
def CmpNEU : PatLeaf<(i32 0x00B)>;
def CmpLTU : PatLeaf<(i32 0x00C)>;
def CmpLEU : PatLeaf<(i32 0x00D)>;
def CmpGTU : PatLeaf<(i32 0x00E)>;
def CmpGEU : PatLeaf<(i32 0x00F)>;
def CmpNUM : PatLeaf<(i32 0x010)>;
def CmpNAN : PatLeaf<(i32 0x011)>;

// Modes with the FTZ flag (0x100) added.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying one of the encodings above; rendered by printCmpMode.
let PrintMethod = "printCmpMode" in
def CmpMode : Operand<i32>;
// i32 operand rendered by the printVecElement print method.
let PrintMethod = "printVecElement" in
def VecElement : Operand<i32>;
113
// PRMT modes.
// These must match the enum in NVPTX.h.
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E : PatLeaf<(i32 0x1)>;
def PrmtB4E : PatLeaf<(i32 0x2)>;
def PrmtRC8 : PatLeaf<(i32 0x3)>;
def PrmtECL : PatLeaf<(i32 0x4)>;
def PrmtECR : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;

// i32 operand carrying one of the encodings above; rendered by printPrmtMode.
let PrintMethod = "printPrmtMode" in
def PrmtMode : Operand<i32>;
127
128
129//===----------------------------------------------------------------------===//
130// NVPTX Instruction Predicate Definitions
131//===----------------------------------------------------------------------===//
132
133
// Predicates that forward to a query on the Subtarget object.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero selection (positive and negated form).
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

// Evaluates the expression `doMulWide` in the generated matcher code.
def doMulWide : Predicate<"doMulWide">;

// FMA contraction and unsafe-FP-math switches, each with its negation.
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath    : Predicate<"!allowUnsafeFPMath()">;

// f32 division level: 0 selects the APPROX predicate, 1 the FULL predicate.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

// f32 square root: RN when usePrecSqrtF32() holds, APPROX otherwise.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

// Subtarget hasHWROT32() query and its negation.
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate that always holds.
def True : Predicate<"true">;
163
// Parameterised minimum-version predicates: hasPTX<N> holds when the
// subtarget's PTX version is at least N, hasSM<N> when its SM version is at
// least N.  The integer is converted to text and appended to the C++ check.
class hasPTX<int version>
    : Predicate<!strconcat("Subtarget->getPTXVersion() >= ",
                           !cast<string>(version))>;
class hasSM<int version>
    : Predicate<!strconcat("Subtarget->getSmVersion() >= ",
                           !cast<string>(version))>;
166
// Explicit record for the arch-accelerated SM version: holds only when the
// full SM version is exactly 901.
def hasSM90a
    : Predicate<"Subtarget->getFullSmVersion() == 901">;
169
// Non-sync shfl instructions are not available on sm_70+ in PTX6.4+, so this
// predicate is the negation of (SM >= 70 and PTX >= 64).  The two pieces are
// joined without an intervening space, exactly as before.
def hasSHFL
    : Predicate<!strconcat("!(Subtarget->getSmVersion() >= 70",
                           "&& Subtarget->getPTXVersion() >= 64)")>;
173
// Short-pointer mode and 16-bit floating-point math availability.
def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math : Predicate<"Subtarget->hasBF16Math()">;
177
// Helper class to aid conversion between ValueType and a matching
// RegisterClass.
//
// `name` is the textual name of the ValueType record T, and `ret` is the
// register class selected for it by the !cond table below.  The table has no
// default arm, so T must be one of the listed names.
//
// Scalar and packed 16-bit types share the integer classes: i16/f16/bf16 map
// to Int16Regs and v2i16/v2f16/v2bf16 map to Int32Regs.
class ValueToRegClass<ValueType T> {
   string name = !cast<string>(T);
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
     !eq(name, "v2i16"): Int32Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     !eq(name, "f16"): Int16Regs,
     !eq(name, "v2f16"): Int32Regs,
     !eq(name, "bf16"): Int16Regs,
     !eq(name, "v2bf16"): Int32Regs,
     !eq(name, "f32"): Float32Regs,
     !eq(name, "f64"): Float64Regs,
     // Argument register classes, keyed "a" + <type name>.
     !eq(name, "ai32"): Int32ArgRegs,
     !eq(name, "ai64"): Int64ArgRegs,
     !eq(name, "af32"): Float32ArgRegs,
     // Was spelled "if64", which broke the "a<type>" pattern used by the
     // three entries above.
     !eq(name, "af64"): Float64ArgRegs
    );
}
200
201
202//===----------------------------------------------------------------------===//
203// Some Common Instruction Class Templates
204//===----------------------------------------------------------------------===//
205
// Integer instructions with one result and two sources, for 64-, 32- and
// 16-bit registers.
//
// Every width gets a register/register ("rr") and a register/immediate ("ri")
// form.  The width is appended directly to OpcStr, so the instructions are
// named "<OpcStr><Width>" (e.g. OpcStr = "add.s" yields "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
      OpcStr # "64 \t$dst, $a, $b;",
      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
      OpcStr # "64 \t$dst, $a, $b;",
      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.  The register operands are typed explicitly as i32.
  def i32rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # "32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
      OpcStr # "32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 16-bit forms.
  def i16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # "16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
      OpcStr # "16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}
234
// Packed 2 x i16 instruction with one result and two sources, all held in
// 32-bit registers and typed as v2i16 in the pattern.  "16x2" is appended to
// OpcStr.  Requires hasPTX<80> and hasSM<90>.
class I16x2<string OpcStr, SDNode OpNode>
    : NVPTXInst<
          (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
          OpcStr # "16x2 \t$dst, $a, $b;",
          [(set Int32Regs:$dst,
                (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
      Requires<[hasPTX<80>, hasSM<90>]>;
240
// Integer instructions with one result and two sources, used for the
// carry-related add/sub nodes.
//
// Emits 32-bit and 64-bit forms, each as register/register ("rr") and
// register/immediate ("ri").  The type suffix is appended to OpcStr, giving
// "<OpcStr>.s32" and "<OpcStr>.s64" (e.g. "addc.cc.s32").  The 64-bit forms
// additionally require hasPTX<43>.  All four are marked hasSideEffects.
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  let hasSideEffects = true in {
    // 32-bit forms.
    def i32rr : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
        OpcStr # ".s32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst,
              (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
    def i32ri : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
        OpcStr # ".s32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

    // 64-bit forms.
    def i64rr : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
        OpcStr # ".s64 \t$dst, $a, $b;",
        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
        OpcStr # ".s64 \t$dst, $a, $b;",
        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}
265
// Floating-point instructions with one result and two sources.  The type
// suffix is appended to OpcStr, giving "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Coverage:
//   f64, f32                   - register/register and register/immediate
//   f16, f16x2, bf16, bf16x2   - register/register only
// Every type except f64 also gets an ".ftz" variant (flush subnormal inputs
// and results to sign-preserving zero), guarded by doF32FTZ.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
//
// NOTE: each ".ftz" def is listed before its plain counterpart; keep that
// order (it presumably decides which pattern wins when both are eligible --
// confirm before reordering).
multiclass F3<string OpcStr, SDNode OpNode> {
  // ---- f64 ----
  def f64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32 ----
  def f32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".ftz.f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- f16x2 (held in Int32Regs) ----
  def f16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".f16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math]>;

  // ---- bf16x2 (held in Int32Regs) ----
  def bf16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".bf16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math]>;
}
360
// Floating-point instructions with one result and two sources, for nodes that
// can be folded to make fma ops.  The type suffix is appended to OpcStr,
// giving "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Two parallel families are emitted:
//   * plain defs, selected when allowFMA holds;
//   * "_rn"-prefixed defs, selected when noFMA holds.  These print the ".rn"
//     variant, which behaves just like the non-".rn" op but prevents ptxas
//     from creating FMAs.
//
// Within each family the coverage is f64 and f32 (rr and ri) plus f16, f16x2,
// bf16 and bf16x2 (rr only).  Every type except f64 also gets an ".ftz"
// variant (flush subnormal inputs and results to sign-preserving zero),
// guarded by doF32FTZ.
//
// NOTE: each ".ftz" def is listed before its plain counterpart; keep that
// order (it presumably decides which pattern wins when both are eligible --
// confirm before reordering).
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- FMA contraction allowed: plain instructions -------------------===//

  // ---- f64 ----
  def f64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- f32 ----
  def f32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".ftz.f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ---- f16x2 (held in Int32Regs) ----
  // The .ftz form types its result explicitly as v2f16.
  def f16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
      [(set (v2f16 Int32Regs:$dst),
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".f16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  // ---- bf16x2 (held in Int32Regs) ----
  // The .ftz form types its result explicitly as v2bf16.
  def bf16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
      [(set (v2bf16 Int32Regs:$dst),
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".bf16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  //===--- FMA contraction disabled: ".rn" instructions ------------------===//
  // These have strange names so we don't perturb existing mir tests.

  // ---- f64 ----
  def _rnf64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".rn.f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".rn.f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- f32 ----
  def _rnf32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".rn.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".rn.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- f16 (held in Int16Regs) ----
  def _rnf16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".rn.f16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // ---- f16x2 (held in Int32Regs) ----
  def _rnf16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // ---- bf16 (held in Int16Regs) ----
  def _rnbf16rr_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # ".rn.bf16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst,
            (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;

  // ---- bf16x2 (held in Int32Regs) ----
  def _rnbf16x2rr_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
      [(set Int32Regs:$dst,
            (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
}
544
// Unary floating-point instructions (one result, one source) for f64 and f32.
// Provides three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero; guarded by doF32FTZ) and
// <OpcStr>.f32.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a),
      OpcStr # ".f64 \t$dst, $a;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;

  // The .ftz def precedes the plain f32 def; keep that order.
  def f32_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a),
      OpcStr # ".ftz.f32 \t$dst, $a;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a),
      OpcStr # ".f32 \t$dst, $a;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
560
// Unary instructions (one result, one source) for the 16-bit floating-point
// types and their packed x2 forms.
//
//   bf16, bf16x2 - require hasSM<80> and hasPTX<70>; no .ftz variant here.
//   f16, f16x2   - require hasSM<53> and hasPTX<65>; an ".ftz" variant guarded
//                  by doF32FTZ is listed before the plain one.
//
// Scalars live in Int16Regs and packed pairs in Int32Regs.
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  // ---- bf16 / bf16x2 ----
  def bf16 : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a),
      OpcStr # ".bf16 \t$dst, $a;",
      [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a),
      OpcStr # ".bf16x2 \t$dst, $a;",
      [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;

  // ---- f16 / f16x2, flush-to-zero ----
  def f16_ftz : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a),
      OpcStr # ".ftz.f16 \t$dst, $a;",
      [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a),
      OpcStr # ".ftz.f16x2 \t$dst, $a;",
      [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;

  // ---- f16 / f16x2 ----
  def f16 : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a),
      OpcStr # ".f16 \t$dst, $a;",
      [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a),
      OpcStr # ".f16x2 \t$dst, $a;",
      [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
}
588
589//===----------------------------------------------------------------------===//
590// NVPTX Instructions.
591//===----------------------------------------------------------------------===//
592
593//-----------------------------------
594// Type Conversion
595//-----------------------------------
596
597let hasSideEffects = false in {
598  // Generate a cvt to the given type from all possible types.  Each instance
599  // takes a CvtMode immediate that defines the conversion mode to use.  It can
600  // be CvtNONE to omit a conversion mode.
601  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
602    def _s8 :
603      NVPTXInst<(outs RC:$dst),
604                (ins Int16Regs:$src, CvtMode:$mode),
605                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
606                ToType, ".s8 \t$dst, $src;"), []>,
607      Requires<Preds>;
608    def _u8 :
609      NVPTXInst<(outs RC:$dst),
610                (ins Int16Regs:$src, CvtMode:$mode),
611                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
612                ToType, ".u8 \t$dst, $src;"), []>,
613      Requires<Preds>;
614    def _s16 :
615      NVPTXInst<(outs RC:$dst),
616                (ins Int16Regs:$src, CvtMode:$mode),
617                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
618                ToType, ".s16 \t$dst, $src;"), []>,
619      Requires<Preds>;
620    def _u16 :
621      NVPTXInst<(outs RC:$dst),
622                (ins Int16Regs:$src, CvtMode:$mode),
623                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
624                ToType, ".u16 \t$dst, $src;"), []>,
625      Requires<Preds>;
626    def _s32 :
627      NVPTXInst<(outs RC:$dst),
628                (ins Int32Regs:$src, CvtMode:$mode),
629                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
630                ToType, ".s32 \t$dst, $src;"), []>,
631      Requires<Preds>;
632    def _u32 :
633      NVPTXInst<(outs RC:$dst),
634                (ins Int32Regs:$src, CvtMode:$mode),
635                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
636                ToType, ".u32 \t$dst, $src;"), []>,
637      Requires<Preds>;
638    def _s64 :
639      NVPTXInst<(outs RC:$dst),
640                (ins Int64Regs:$src, CvtMode:$mode),
641                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
642                ToType, ".s64 \t$dst, $src;"), []>,
643      Requires<Preds>;
644    def _u64 :
645      NVPTXInst<(outs RC:$dst),
646                (ins Int64Regs:$src, CvtMode:$mode),
647                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
648                ToType, ".u64 \t$dst, $src;"), []>,
649      Requires<Preds>;
650    def _f16 :
651      NVPTXInst<(outs RC:$dst),
652                (ins Int16Regs:$src, CvtMode:$mode),
653                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
654                ToType, ".f16 \t$dst, $src;"), []>,
655      Requires<Preds>;
656    def _bf16 :
657      NVPTXInst<(outs RC:$dst),
658                (ins Int16Regs:$src, CvtMode:$mode),
659                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
660                ToType, ".bf16 \t$dst, $src;"), []>,
661      Requires<!if(!eq(ToType, "f32"),
662                   // bf16->f32 was introduced early.
663                   [hasPTX<71>, hasSM<80>],
664                   // bf16->everything else needs sm90/ptx78
665                   [hasPTX<78>, hasSM<90>])>;
666    def _f32 :
667      NVPTXInst<(outs RC:$dst),
668                (ins Float32Regs:$src, CvtMode:$mode),
669                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
670                ToType, ".f32 \t$dst, $src;"), []>,
671      Requires<!if(!eq(ToType, "bf16"),
672                   // f32->bf16 was introduced early.
673                   [hasPTX<70>, hasSM<80>],
674                   Preds)>;
675    def _f64 :
676      NVPTXInst<(outs RC:$dst),
677                (ins Float64Regs:$src, CvtMode:$mode),
678                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
679                ToType, ".f64 \t$dst, $src;"), []>,
680      Requires<Preds>;
681  }
682
  // Generate cvts from all types to all types.
  //
  // Each defm instantiates CVT_FROM_ALL once per destination type: the string
  // is the PTX type suffix printed directly after the cvt modifiers, and the
  // register class is the one used for $dst.  The multiclass appends one
  // "_<src>" def per source type, so records are named CVT_<dst>_<src>
  // (for example CVT_f32_f32).  Note that the s8/u8, s16/u16, f16 and bf16
  // destinations all use Int16Regs.
  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
  // bf16 is the only destination that passes an explicit predicate list.
  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
696
  // These cvts are different from those above: The source and dest registers
  // are of the same type.
  //
  // None of them has a selection pattern or a conversion-mode operand; the asm
  // string is fixed.  Each record is named CVT_INREG_<dst>_<src>, mirroring the
  // "cvt.<dst>.<src>" mnemonic it prints, and both $dst and $src use the
  // register class that matches <dst>.
  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                                    "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                                    "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                                    "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                                    "cvt.s64.s32 \t$dst, $src;", []>;
711
712  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
713    def _f32 :
714      NVPTXInst<(outs RC:$dst),
715                (ins Float32Regs:$src1, Float32Regs:$src2,  CvtMode:$mode),
716                !strconcat("cvt${mode:base}${mode:relu}.",
717                FromName, ".f32 \t$dst, $src1, $src2;"), []>,
718    Requires<[hasPTX<70>, hasSM<80>]>;
719  }
720
721  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
722  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
723}
724
725//-----------------------------------
726// Selection instructions (selp)
727//-----------------------------------
728
729// TODO: Missing slct
730
731// selp instructions that don't have any pattern matches; we explicitly use
732// them within this file.
733let hasSideEffects = false in {
734  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
735    def rr : NVPTXInst<(outs RC:$dst),
736                       (ins RC:$a, RC:$b, Int1Regs:$p),
737                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
738    def ri : NVPTXInst<(outs RC:$dst),
739                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
740                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
741    def ir : NVPTXInst<(outs RC:$dst),
742                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
743                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
744    def ii : NVPTXInst<(outs RC:$dst),
745                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
746                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
747  }
748
749  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
750                          Operand ImmCls, SDNode ImmNode> {
751    def rr :
752      NVPTXInst<(outs RC:$dst),
753                (ins RC:$a, RC:$b, Int1Regs:$p),
754                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
755                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
756    def ri :
757      NVPTXInst<(outs RC:$dst),
758                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
759                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
760                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
761    def ir :
762      NVPTXInst<(outs RC:$dst),
763                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
764                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
765                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
766    def ii :
767      NVPTXInst<(outs RC:$dst),
768                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
769                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
770                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
771  }
772}
773
// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.
//
// Consequently only the b16/b32/b64 and floating-point variants are built from
// SELP_PATTERN; the s/u variants are built from the pattern-less SELP
// multiclass and are only usable by explicit reference.
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16 and bf16 values live in Int16Regs and are selected with selp.b16; only
// the value type and the immediate operand/node differ from SELP_b16.
defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
790
791// This does not work as tablegen fails to infer the type of 'imm'.
792// def v2f16imm : Operand<v2f16>;
793// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
794
// Select between two packed vectors held in Int32Regs.  Every listed vector
// type is mapped onto the register/register form of selp.b32; no immediate
// forms are provided for these types.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
}
799
800//-----------------------------------
801// Test Instructions
802//-----------------------------------
803
// testp.infinite: writes predicate $p from floating-point input $a.  The "r"
// variants take a register operand and the "i" variants an immediate.  None of
// them has a selection pattern, so they are only used by explicit reference
// (for example from the frem lowering later in this file).
def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
816
817//-----------------------------------
818// Integer Arithmetic
819//-----------------------------------
820
// Template for xor masquerading as int1 arithmetic.
//
// OpNode is the DAG node to match (add or sub).  Both variants print
// "xor.pred": on a single bit, addition and subtraction modulo 2 are the same
// operation as exclusive-or.
//   _rr: both operands are Int1Regs registers.
//   _ri: the second operand is an i1 immediate.
multiclass ADD_SUB_i1<SDNode OpNode> {
   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
834
// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// Packed 16x2 variants of add and sub.
def ADD16x2 : I16x2<"add.s", add>;
def SUB16x2 : I16x2<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// Low half of the product; the signed mnemonic serves both signednesses
// because only the low bits are kept.
defm MULT : I3<"mul.lo.s", mul>;

// High half of the product; here signedness matters, so there is one
// instruction family per node.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
863
// Integer absolute value.
//
// Matches the generic `abs` node on value type T and prints a single
// "abs<SizeName>" instruction on register class RC.  SizeName is the full type
// suffix including the leading dot (e.g. ".s32").  The def inside the
// multiclass is anonymous, so the instantiations below produce unnamed
// records.
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  !strconcat("abs", SizeName, " \t$dst, $a;"),
                  [(set (T RC:$dst), (abs (T RC:$a)))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
875
// Integer min/max.
// Signed and unsigned variants need distinct mnemonics (.s vs .u), so each
// generic node gets its own instruction family.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

// Packed 16x2 variants of the same four operations.
def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;
886
887
888//
889// Wide multiplication
890//
891def MULWIDES64 :
892  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
893            "mul.wide.s32 \t$dst, $a, $b;", []>;
894def MULWIDES64Imm :
895  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
896            "mul.wide.s32 \t$dst, $a, $b;", []>;
897def MULWIDES64Imm64 :
898  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
899            "mul.wide.s32 \t$dst, $a, $b;", []>;
900
901def MULWIDEU64 :
902  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
903            "mul.wide.u32 \t$dst, $a, $b;", []>;
904def MULWIDEU64Imm :
905  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
906            "mul.wide.u32 \t$dst, $a, $b;", []>;
907def MULWIDEU64Imm64 :
908  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
909            "mul.wide.u32 \t$dst, $a, $b;", []>;
910
// 16-bit x 16-bit -> 32-bit wide multiplication, signed.  Pattern-less, like
// the 64-bit-result forms.
//   (no suffix): both sources are Int16Regs.
//   Imm:         second source is an i16imm.
//   Imm32:       second source is an i32imm (used when the matched constant
//                is an i32 node).
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

// Unsigned counterparts of the three definitions above.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
930
// Type profile for the target-specific mul.wide nodes: one result and two
// operands, where the two operands must have the same type.  The result type
// is not tied to the operand type by this profile.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
934
// Matchers for signed, unsigned mul.wide ISD nodes.
//
// Each NVPTXISD::MUL_WIDE_* node maps to the register/register form of the
// matching mul.wide instruction, or to the register/immediate form when the
// second operand is a constant.  All patterns are gated on doMulWide.
// Result operands are consistently written with their register class.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;

def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
961
962// Predicates used for converting some patterns to mul.wide.
def SInt32Const : PatLeaf<(imm), [{
  // Accept constants representable as a signed 32-bit integer.
  return N->getAPIntValue().isSignedIntN(32);
}]>;
967
def UInt32Const : PatLeaf<(imm), [{
  // Accept constants representable as an unsigned 32-bit integer.
  return N->getAPIntValue().isIntN(32);
}]>;
972
def SInt16Const : PatLeaf<(imm), [{
  // Accept constants representable as a signed 16-bit integer.
  return N->getAPIntValue().isSignedIntN(16);
}]>;
977
def UInt16Const : PatLeaf<(imm), [{
  // Accept constants representable as an unsigned 16-bit integer.
  return N->getAPIntValue().isIntN(16);
}]>;
982
def IntConst_0_30 : PatLeaf<(imm), [{
  // Accept shift amounts v with 0 <= v < 31; only then will the result of
  // (x << v) be an int32.
  const APInt &ShAmt = N->getAPIntValue();
  return !(ShAmt.slt(0) || ShAmt.sge(31));
}]>;
988
def IntConst_0_14 : PatLeaf<(imm), [{
  // Accept shift amounts v with 0 <= v < 15; only then will the result of
  // (x << v) be an int16.
  const APInt &ShAmt = N->getAPIntValue();
  return !(ShAmt.slt(0) || ShAmt.sge(15));
}]>;
994
def SHL2MUL32 : SDNodeXForm<imm, [{
  // Turn a shift amount v into the equivalent 32-bit multiplier (1 << v).
  APInt Multiplier = APInt(32, 1).shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i32);
}]>;
1000
def SHL2MUL16 : SDNodeXForm<imm, [{
  // Turn a shift amount v into the equivalent 16-bit multiplier (1 << v).
  APInt Multiplier = APInt(16, 1).shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Multiplier, SDLoc(N), MVT::i16);
}]>;
1006
// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
//
// The shift amount is restricted by IntConst_0_30 / IntConst_0_14 and is
// rewritten into the multiplier (1 << amount) by SHL2MUL32 / SHL2MUL16, so the
// immediate form of mul.wide can be used.  sext selects the signed
// instruction and zext the unsigned one.
def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;

// Same rewrite for 16-bit sources producing a 32-bit result.
def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
1021
// Convert "sign/zero-extend then multiply" to mul.wide.
//
// Two shapes are matched for each width and signedness:
//   * both factors are extensions of narrow registers -> register form;
//   * one factor is an extension and the other a wide constant that passes
//     the matching {S,U}Int{16,32}Const range check -> the Imm32/Imm64 form,
//     whose immediate operand has the wide type of the matched constant.

// 32-bit sources, signed.
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
      Requires<[doMulWide]>;

// 32-bit sources, unsigned.
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
      Requires<[doMulWide]>;

// 16-bit sources, signed.
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
      Requires<[doMulWide]>;

// 16-bit sources, unsigned.
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
      Requires<[doMulWide]>;
1050
1051//
1052// Integer multiply-add
1053//
1054def SDTIMAD :
1055  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
1056                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
1057def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
1058
// 16-bit mad.lo selected for the imad node.  The three-letter suffix gives the
// kind of $a, $b and $c in order (r = Int16Regs register, i = i16 immediate);
// $a is always a register.
def MAD16rrr :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
1079
// 32-bit mad.lo selected for the imad node.  Suffix letters describe $a, $b
// and $c in order (r = Int32Regs register, i = i32 immediate).  Register
// operands are annotated with an explicit i32 value type in the patterns.
def MAD32rrr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
def MAD32rri :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
def MAD32rir :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
def MAD32rii :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;
1100
// 64-bit mad.lo selected for the imad node.  Suffix letters describe $a, $b
// and $c in order (r = Int64Regs register, i = i64 immediate).
def MAD64rrr :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
1121
// Integer negation for 16-, 32- and 64-bit registers, selected for the ineg
// pattern fragment.  Only the 32-bit pattern spells out the value type.
def INEG16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "neg.s16 \t$dst, $src;",
            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "neg.s32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
def INEG64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "neg.s64 \t$dst, $src;",
            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1134
1135//-----------------------------------
1136// Floating Point Arithmetic
1137//-----------------------------------
1138
// Constant 1.0f: an fpimm leaf that has IEEE single semantics and whose value
// compares equal to 1.0f.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  // The semantics check must come first: the value is only converted to
  // float once it is known to be an IEEE single.
  if (&Val.getSemantics() != &llvm::APFloat::IEEEsingle())
    return false;
  return Val.convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double): an fpimm leaf that has IEEE double semantics and
// whose value compares equal to 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  // The semantics check must come first: the value is only converted to
  // double once it is known to be an IEEE double.
  if (&Val.getSemantics() != &llvm::APFloat::IEEEdouble())
    return false;
  return Val.convertToDouble() == 1.0;
}]>;
1149
// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
//
// LOAD_CONST_BF16 is the same instruction with a bf16 immediate operand.
// Neither definition has a selection pattern.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
def LOAD_CONST_BF16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
// Basic floating-point arithmetic.  add/sub/mul use the F3_fma_component
// multiclass, while min/max use plain F3.
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

// min/max map to the fminnum/fmaxnum nodes.
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
// The .NaN forms map to the fminimum/fmaximum nodes.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

// Unary abs/neg; the _H variants come from the F2_Support_Half multiclass.
defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
defm FNEG_H: F2_Support_Half<"neg", fneg>;

defm FSQRT : F2<"sqrt.rn", fsqrt>;
1177
1178//
1179// F16 NEG
1180//
1181class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1182      NVPTXInst<(outs RC:$dst), (ins RC:$src),
1183                !strconcat(OpcStr, " \t$dst, $src;"),
1184                [(set RC:$dst, (fneg (T RC:$src)))]>,
1185                Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
1186def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
1187def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
1188def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1189def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
1190
1191//
1192// BF16 NEG
1193//
1194
1195class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1196      NVPTXInst<(outs RC:$dst), (ins RC:$src),
1197                !strconcat(OpcStr, " \t$dst, $src;"),
1198                [(set RC:$dst, (fneg (T RC:$src)))]>,
1199                Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
1200def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1201def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
1202def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1203def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1204
1205//
1206// F64 division
1207//
1208def FDIV641r :
1209  NVPTXInst<(outs Float64Regs:$dst),
1210            (ins f64imm:$a, Float64Regs:$b),
1211            "rcp.rn.f64 \t$dst, $b;",
1212            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
1213def FDIV64rr :
1214  NVPTXInst<(outs Float64Regs:$dst),
1215            (ins Float64Regs:$a, Float64Regs:$b),
1216            "div.rn.f64 \t$dst, $a, $b;",
1217            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
1218def FDIV64ri :
1219  NVPTXInst<(outs Float64Regs:$dst),
1220            (ins Float64Regs:$a, f64imm:$b),
1221            "div.rn.f64 \t$dst, $a, $b;",
1222            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
1223
1224//
1225// F32 Approximate reciprocal
1226//
1227def FDIV321r_ftz :
1228  NVPTXInst<(outs Float32Regs:$dst),
1229            (ins f32imm:$a, Float32Regs:$b),
1230            "rcp.approx.ftz.f32 \t$dst, $b;",
1231            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1232            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1233def FDIV321r :
1234  NVPTXInst<(outs Float32Regs:$dst),
1235            (ins f32imm:$a, Float32Regs:$b),
1236            "rcp.approx.f32 \t$dst, $b;",
1237            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1238            Requires<[do_DIVF32_APPROX]>;
1239//
1240// F32 Approximate division
1241//
1242def FDIV32approxrr_ftz :
1243  NVPTXInst<(outs Float32Regs:$dst),
1244            (ins Float32Regs:$a, Float32Regs:$b),
1245            "div.approx.ftz.f32 \t$dst, $a, $b;",
1246            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1247            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1248def FDIV32approxri_ftz :
1249  NVPTXInst<(outs Float32Regs:$dst),
1250            (ins Float32Regs:$a, f32imm:$b),
1251            "div.approx.ftz.f32 \t$dst, $a, $b;",
1252            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1253            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1254def FDIV32approxrr :
1255  NVPTXInst<(outs Float32Regs:$dst),
1256            (ins Float32Regs:$a, Float32Regs:$b),
1257            "div.approx.f32 \t$dst, $a, $b;",
1258            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1259            Requires<[do_DIVF32_APPROX]>;
1260def FDIV32approxri :
1261  NVPTXInst<(outs Float32Regs:$dst),
1262            (ins Float32Regs:$a, f32imm:$b),
1263            "div.approx.f32 \t$dst, $a, $b;",
1264            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1265            Requires<[do_DIVF32_APPROX]>;
1266//
1267// F32 Semi-accurate reciprocal
1268//
1269// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
1270//
1271def FDIV321r_approx_ftz :
1272  NVPTXInst<(outs Float32Regs:$dst),
1273            (ins f32imm:$a, Float32Regs:$b),
1274            "rcp.approx.ftz.f32 \t$dst, $b;",
1275            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1276            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1277def FDIV321r_approx :
1278  NVPTXInst<(outs Float32Regs:$dst),
1279            (ins f32imm:$a, Float32Regs:$b),
1280            "rcp.approx.f32 \t$dst, $b;",
1281            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1282            Requires<[do_DIVF32_FULL]>;
1283//
1284// F32 Semi-accurate division
1285//
1286def FDIV32rr_ftz :
1287  NVPTXInst<(outs Float32Regs:$dst),
1288            (ins Float32Regs:$a, Float32Regs:$b),
1289            "div.full.ftz.f32 \t$dst, $a, $b;",
1290            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1291            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1292def FDIV32ri_ftz :
1293  NVPTXInst<(outs Float32Regs:$dst),
1294            (ins Float32Regs:$a, f32imm:$b),
1295            "div.full.ftz.f32 \t$dst, $a, $b;",
1296            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1297            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1298def FDIV32rr :
1299  NVPTXInst<(outs Float32Regs:$dst),
1300            (ins Float32Regs:$a, Float32Regs:$b),
1301            "div.full.f32 \t$dst, $a, $b;",
1302            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1303            Requires<[do_DIVF32_FULL]>;
1304def FDIV32ri :
1305  NVPTXInst<(outs Float32Regs:$dst),
1306            (ins Float32Regs:$a, f32imm:$b),
1307            "div.full.f32 \t$dst, $a, $b;",
1308            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1309            Requires<[do_DIVF32_FULL]>;
1310//
1311// F32 Accurate reciprocal
1312//
1313def FDIV321r_prec_ftz :
1314  NVPTXInst<(outs Float32Regs:$dst),
1315            (ins f32imm:$a, Float32Regs:$b),
1316            "rcp.rn.ftz.f32 \t$dst, $b;",
1317            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
1318            Requires<[doF32FTZ]>;
1319def FDIV321r_prec :
1320  NVPTXInst<(outs Float32Regs:$dst),
1321            (ins f32imm:$a, Float32Regs:$b),
1322            "rcp.rn.f32 \t$dst, $b;",
1323            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1324//
1325// F32 Accurate division
1326//
1327def FDIV32rr_prec_ftz :
1328  NVPTXInst<(outs Float32Regs:$dst),
1329            (ins Float32Regs:$a, Float32Regs:$b),
1330            "div.rn.ftz.f32 \t$dst, $a, $b;",
1331            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
1332            Requires<[doF32FTZ]>;
1333def FDIV32ri_prec_ftz :
1334  NVPTXInst<(outs Float32Regs:$dst),
1335            (ins Float32Regs:$a, f32imm:$b),
1336            "div.rn.ftz.f32 \t$dst, $a, $b;",
1337            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
1338            Requires<[doF32FTZ]>;
1339def FDIV32rr_prec :
1340  NVPTXInst<(outs Float32Regs:$dst),
1341            (ins Float32Regs:$a, Float32Regs:$b),
1342            "div.rn.f32 \t$dst, $a, $b;",
1343            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
1344def FDIV32ri_prec :
1345  NVPTXInst<(outs Float32Regs:$dst),
1346            (ins Float32Regs:$a, f32imm:$b),
1347            "div.rn.f32 \t$dst, $a, $b;",
1348            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
1349
1350//
1351// FMA
1352//
1353
1354multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
1355   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1356                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1357                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
1358                       Requires<[Pred]>;
1359   def rri : NVPTXInst<(outs RC:$dst),
1360                       (ins RC:$a, RC:$b, ImmCls:$c),
1361                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1362                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
1363                       Requires<[Pred]>;
1364   def rir : NVPTXInst<(outs RC:$dst),
1365                       (ins RC:$a, ImmCls:$b, RC:$c),
1366                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1367                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
1368                       Requires<[Pred]>;
1369   def rii : NVPTXInst<(outs RC:$dst),
1370                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
1371                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1372                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
1373                       Requires<[Pred]>;
1374}
1375
// Register-only fma for f16 / f16x2 values of type T held in RC.  Requires
// useFP16Math in addition to the caller-supplied Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
             Requires<[useFP16Math, Pred]>;
}
1382
// Register-only fma for bf16 / bf16x2 values of type T held in RC.  Requires
// hasBF16Math in addition to the caller-supplied Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
             Requires<[hasBF16Math, Pred]>;
}
1389
// fma instantiations.  Each 16-bit and 32-bit type has an _ftz variant gated
// on doF32FTZ, listed before the unconditional (True) variant; f64 has only
// the unconditional one.  The 16-bit scalars use Int16Regs and the packed x2
// types use Int32Regs.
defm FMA16_ftz    : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16        : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz  : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2      : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16_ftz   : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16       : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2     : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz    : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32        : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64        : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
1401
// sin/cos
// f32 fsin/fcos map onto the PTX ".approx" instructions, so they are only
// selected when allowUnsafeFPMath holds.
def SINF :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$src),
            "sin.approx.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
  Requires<[allowUnsafeFPMath]>;

def COSF :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$src),
            "cos.approx.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
  Requires<[allowUnsafeFPMath]>;
1411
1412// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1413// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1414// semantics of LLVM's frem.
1415
// frem - f32 FTZ
// Unsafe-FP-math flavor: emit x - trunc(x / y) * y directly, with the
// quotient truncated via CVT_f32_f32 in CvtRZI_FTZ mode.  The first pattern
// takes $y in a register, the second as an immediate.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
             Float32Regs:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
             fpimm:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;

// Non-unsafe flavor: the same expression is wrapped in a SELP_f32rr whose
// predicate is TESTINF on $y and whose first value operand is $x.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;
1442
// frem - f32
// Unsafe-FP-math flavor: emit x - trunc(x / y) * y directly, with the
// quotient truncated via CVT_f32_f32 in CvtRZI mode.  The first pattern takes
// $y in a register, the second as an immediate.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
             Float32Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

// Non-unsafe flavor: the same expression is wrapped in a SELP_f32rr whose
// predicate is TESTINF on $y and whose first value operand is $x.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;
1469
// frem - f64
// Unsafe-FP-math flavor: emit x - trunc(x / y) * y directly, with the
// quotient truncated via CVT_f64_f64 in CvtRZI mode.  The first pattern takes
// $y in a register, the second as an immediate.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
             Float64Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

// Non-unsafe flavor (register $y): the same expression is wrapped in a
// SELP_f64rr whose predicate is TESTINF on $y and whose first value operand
// is $x.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
               Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
// frem - f64 with an immediate divisor, non-unsafe flavor.
// $y is bound to an fpimm in the source pattern, so the infinity test has to
// use the immediate form of TESTINF (mirroring the f32 immediate pattern,
// which uses TESTINF_f32i) instead of the register form.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;
1496
1497//-----------------------------------
1498// Bitwise operations
1499//-----------------------------------
1500
// Template for three-operand bitwise operations (dst, a, b).  For the given
// OpcStr it creates .pred (predicate registers -- i.e., i1), .b16, .b32 and
// .b64 versions, each in a reg/reg (rr) and a reg/imm (ri) form.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // .pred (i1)
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

  // .b16
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

  // .b32 (operands are explicitly typed as i32)
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                 (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // .b64
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
1537
// Instantiate the bitwise template for the or, and and xor DAG nodes.
defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
1541
// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
// Both vector types are matched in Int32Regs, so each op maps onto the
// corresponding b32 instruction defined by the BITWISE instantiations.
foreach vt = [v2i16, v4i8] in {
  // Register/register forms.
  def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;

  // The constants get legalized into a bitcast from i32, so that's what we need
  // to match here.
  def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (ORb32ri Int32Regs:$a, imm:$b)>;
  def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (XORb32ri Int32Regs:$a, imm:$b)>;
  def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (ANDb32ri Int32Regs:$a, imm:$b)>;
}
1560
// Bitwise complement for predicate, 16-, 32- and 64-bit registers.
def NOT1 :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int1Regs:$src),
            "not.pred \t$dst, $src;",
            [(set Int1Regs:$dst, (not Int1Regs:$src))]>;

def NOT16 :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$src),
            "not.b16 \t$dst, $src;",
            [(set Int16Regs:$dst, (not Int16Regs:$src))]>;

def NOT32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src),
            "not.b32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;

def NOT64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src),
            "not.b64 \t$dst, $src;",
            [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1573
// Template for left/right shifts.  Each instruction has three operands:
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16; the shift amount is always
// int32.  OpcStr is completed with the bit width ("64", "32" or "16").
//
// A 32-bit (imm, imm) form is defined as well.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit value
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, Int32Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst,
                 (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a, i32imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

  // 32-bit value
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                 (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                 (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
  def i32ii :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins i32imm:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

  // 16-bit value
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int32Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                 (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, i32imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
1609
// Shift instantiations: shl uses the untyped ".b" form, sra the signed ".s"
// shr form and srl the unsigned ".u" shr form.
defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
1613
// Bit-reverse
// Selects the bitreverse node onto brev.b32 / brev.b64.
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a),
            "brev.b32 \t$dst, $a;",
            [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;

def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a),
            "brev.b64 \t$dst, $a;",
            [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1623
1624//
1625// Rotate: Use ptx shf instruction if available.
1626//
1627
1628// 32 bit r2 = rotl r1, n
1629//    =>
1630//        r2 = shf.l r1, r1, n
// Hardware rotate-left: shf.l.wrap with $src supplied as both data inputs.
// Available only under hasHWROT32; one form per amount kind (imm / reg).
def ROTL32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
               (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTL32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
               (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;
1642
1643// 32 bit r2 = rotr r1, n
1644//    =>
1645//        r2 = shf.r r1, r1, n
// Hardware rotate-right: shf.r.wrap with $src supplied as both data inputs.
// Available only under hasHWROT32; one form per amount kind (imm / reg).
def ROTR32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
               (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTR32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
               (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;
1657
// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Emits a braced PTX scope with two scratch registers: $src shifted left by
// $amt1 and shifted right by $amt2, with the two halves then added together.
// The instruction carries no selection pattern of its own; it is meant to be
// produced by explicit Pat records that supply both amounts.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;
1670
// Immediate transform: turns the constant N into the i32 target constant
// (32 - N).
def SUB_FRM_32 : SDNodeXForm<imm, [{
  const uint64_t Complement = 32 - N->getZExtValue();
  return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
}]>;
1674
// Software 32-bit rotates by an immediate (noHWROT32 only).
// rotl passes ($amt, 32 - $amt) as (left shift, right shift) amounts; rotr
// swaps the two so that $amt becomes the right-shift amount.
def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;
1681
// 32-bit software rotate left by register.
//
// rotl interprets its amount modulo 32, but PTX shl/shr clamp amounts larger
// than the register width instead of wrapping them.  The amount is therefore
// masked to [0, 31] first; the right-shift amount is then 32 minus the masked
// value (a masked amount of 0 yields a right shift by 32, which produces 0,
// so the sum is just $src).
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shl.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1696
// 32-bit software rotate right by register.
//
// rotr interprets its amount modulo 32, but PTX shl/shr clamp amounts larger
// than the register width instead of wrapping them.  The amount is therefore
// masked to [0, 31] first; the left-shift amount is then 32 minus the masked
// value (a masked amount of 0 yields a left shift by 32, which produces 0,
// so the sum is just $src).
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shr.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1711
// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Emits a braced PTX scope with two scratch registers: $src shifted left by
// $amt1 and shifted right by $amt2, with the two halves then added together.
// The instruction carries no selection pattern of its own; it is meant to be
// produced by explicit Pat records that supply both amounts.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;
1724
// Immediate transform: turns the constant N into the i32 target constant
// (64 - N).
def SUB_FRM_64 : SDNodeXForm<imm, [{
  const uint64_t Complement = 64 - N->getZExtValue();
  return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
}]>;
1728
// Software 64-bit rotates by an immediate (no predicate is attached here).
// rotl passes ($amt, 64 - $amt) as (left shift, right shift) amounts; rotr
// swaps the two so that $amt becomes the right-shift amount.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1733
// 64-bit software rotate left by register.
//
// rotl interprets its amount modulo 64, but PTX shl/shr clamp amounts larger
// than the register width instead of wrapping them.  The amount is therefore
// masked to [0, 63] first; the right-shift amount is then 64 minus the masked
// value (a masked amount of 0 yields a right shift by 64, which produces 0,
// so the sum is just $src).
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 63;\n\t"
            "shl.b64 \t%lhs, $src, %amt2;\n\t"
            "sub.u32 \t%amt2, 64, %amt2;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1747
// 64-bit software rotate right by register.
//
// rotr interprets its amount modulo 64, but PTX shl/shr clamp amounts larger
// than the register width instead of wrapping them.  The amount is therefore
// masked to [0, 63] first; the left-shift amount is then 64 minus the masked
// value (a masked amount of 0 yields a left shift by 64, which produces 0,
// so the sum is just $src).
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 63;\n\t"
            "shr.b64 \t%lhs, $src, %amt2;\n\t"
            "sub.u32 \t%amt2, 64, %amt2;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1760
1761//
// Funnel shift in clamp mode
1763//
1764
// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// Both nodes use the SDTIntShiftDOp type profile and carry no node
// properties.
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1769
// Left funnel shift in clamp mode; selected from the FUN_SHFL_CLAMP node.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
               (FUN_SHFL_CLAMP (i32 Int32Regs:$lo),
                               (i32 Int32Regs:$hi),
                               (i32 Int32Regs:$amt)))]>;
1776
// Right funnel shift in clamp mode; selected from the FUN_SHFR_CLAMP node.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
               (FUN_SHFR_CLAMP (i32 Int32Regs:$lo),
                               (i32 Int32Regs:$hi),
                               (i32 Int32Regs:$amt)))]>;
1783
1784//
1785// BFE - bit-field extract
1786//
1787
// Template for BFE/BFI instructions.
// BFE args: [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
// BFI takes an additional leading source operand (the value to insert).
// Start may be an imm only if length is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and length are always int32.
// NVPTXISD::BFE - 1 result, 3 operands.  Result and operand 1 share one
// integer type; operands 2 and 3 are i32.
def SDTBFE : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisInt<0>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>
]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

// NVPTXISD::BFI - 1 result, 4 operands.  Result and operands 1 and 2 share
// one integer type; operands 3 and 4 are i32.
def SDTBFI : SDTypeProfile<1, 4, [
  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
  SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

// NVPTXISD::PRMT - 1 result, 4 operands, all i32.
def SDTPRMT : SDTypeProfile<1, 4, [
  SDTCisVT<0, i32>, SDTCisVT<1, i32>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
1808
// Bit-field extract template.
//   Instr - PTX mnemonic.
//   T     - value type of the source and the result.
//   RC    - register class of the source and the result.
// The suffix names the kinds of $a, $b, $c (r = register, i = immediate);
// $b and $c are always i32.
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
  def rri :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, i32imm:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
  def rii :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, i32imm:$b, i32imm:$c),
              Instr # " \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}
1826
// Bit-field insert template.
//   Instr  - PTX mnemonic.
//   T      - value type of $a, $b and the result.
//   RC     - register class of $b and the result (and of $a when it is a
//            register).
//   ImmCls - immediate operand class used when $a is an immediate.
// The suffix names the kinds of $a, $b, $c, $d (r = register, i = immediate);
// $c and $d are always i32.
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  // $a in a register.
  def rrrr :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T RC:$a), (T RC:$b),
                      (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def rrri :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T RC:$a), (T RC:$b),
                      (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def rrii :
    NVPTXInst<(outs RC:$f),
              (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T RC:$a), (T RC:$b),
                      (i32 imm:$c), (i32 imm:$d)))]>;

  // $a as an immediate.
  def irrr :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T imm:$a), (T RC:$b),
                      (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def irri :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T imm:$a), (T RC:$b),
                      (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def irii :
    NVPTXInst<(outs RC:$f),
              (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
              Instr # " \t$f, $a, $b, $c, $d;",
              [(set (T RC:$f),
                 (bfi (T imm:$a), (T RC:$b),
                      (i32 imm:$c), (i32 imm:$d)))]>;
}
1859
// Byte-permute template.  The mode operand is printed as a suffix of the
// "prmt.b32" mnemonic.  The suffix names the kinds of $a, $b, $c
// (r = register, i = immediate).
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (prmt (T RC:$a), (T RC:$b),
                       (i32 Int32Regs:$c), imm:$mode))]>;
  def rri :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (prmt (T RC:$a), (T RC:$b),
                       (i32 imm:$c), imm:$mode))]>;
  def rii :
    NVPTXInst<(outs RC:$d),
              (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
              "prmt.b32${mode} \t$d, $a, $b, $c;",
              [(set (T RC:$d),
                 (prmt (T RC:$a), (T imm:$b),
                       (i32 imm:$c), imm:$mode))]>;
}
1877
// Instantiate the bit-field and permute templates; every instruction created
// here is marked hasSideEffects = false.
let hasSideEffects = false in {
  // Signed/unsigned extract for 32- and 64-bit sources.
  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
  defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
  defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;

  // Insert for 32- and 64-bit values.
  defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
  defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;

  // Byte permute on 32-bit values.
  defm PRMT_B32 : PRMT<i32, Int32Regs>;
}
1889
1890
// byte extraction + signed/unsigned extension to i32.
// An 8-bit bfe followed by sext_inreg-from-i8 folds into a single signed
// BFE; the same bfe followed by "and 255" folds into a single unsigned BFE.
// Each fold exists for a register offset and for an immediate offset.
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s),  (i32 Int32Regs:$o), 8), i8)),
          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s),  (i32 imm:$o), 8), i8)),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s),  (i32 Int32Regs:$o), 8), 255)),
          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s),  (i32 imm:$o), 8), 255)),
          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;

// byte extraction + signed extension to i16
// The signed BFE result is passed through CVT_s8_s32 (CvtNONE mode) to
// produce the i16 value.
def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s),  (i32 imm:$o), 8)), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1904
1905
// Byte extraction via shift/trunc/sext
// Sign-extending the low byte of a truncated value maps onto CVT_s8_*; when
// the byte is first shifted down by an immediate, the shift is replaced by a
// signed 8-bit BFE starting at that offset.
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s),  (i32 imm:$o))), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i32 Int32Regs:$s),  (i32 imm:$o)), i8),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
// Arithmetic shift right by 8 of the truncated i16: extract 8 bits starting
// at bit 8 of the 32-bit source.
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
// 64-bit source variants.
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s),  (i32 imm:$o)), i8),
          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s),  (i32 imm:$o))), i8)),
          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
1921
1922//-----------------------------------
1923// Comparison instructions (setp, set)
1924//-----------------------------------
1925
1926// FIXME: This doesn't cover versions of set and setp that combine with a
1927// boolean predicate, e.g. setp.eq.and.b16.
1928
// setp template: compares $a with $b and writes an i1 predicate.  The
// comparison operator (and its ftz flag) comes from the CmpMode operand and
// is printed between "setp" and the type suffix.  No selection patterns are
// attached.  Forms: rr (reg, reg), ri (reg, imm), ir (imm, reg).
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins RC:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
  }
}
1945
// setp instantiations: untyped (.b), signed (.s) and unsigned (.u) integer
// compares for 16/32/64 bits, plus f32 and f64.
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// Half-precision setp, register/register only.  The scalar forms write one
// predicate; the packed x2 forms write a pair, printed as "$p|$q".

// f16 / f16x2 require useFP16Math.
def SETP_f16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

def SETP_f16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

// bf16 / bf16x2 require hasBF16Math, PTX 7.8 and SM 90.
def SETP_bf16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

def SETP_bf16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
1981
1982
1983// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1984// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1985// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1986
// set template: compares $a with $b and writes the result to a 32-bit
// register.  The CmpMode operand is printed right after "set", followed by
// the type suffix.  No selection patterns are attached.
// Forms: rr (reg, reg), ri (reg, imm), ir (imm, reg).
let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ir :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                []>;
  }
}
2000
// set instantiations: untyped/signed/unsigned integer compares for 16/32/64
// bits, plus f16, bf16, f32 and f64.  f16 and bf16 operands live in
// Int16Regs; the bf16 flavor additionally requires PTX 7.8 and SM 90.
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
2014
2015//-----------------------------------
2016// Data Movement (Load / Store, Move)
2017//-----------------------------------
2018
// Addressing-mode complex patterns.
// ADDRri / ADDRri64 yield two operands for i32 / i64 addresses, name
// "SelectADDRri" / "SelectADDRri64" as their selection functions, list
// frameindex as a root node and set SDNPWantRoot.
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
                            [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
                              [SDNPWantRoot]>;
// ADDRvar yields a single pointer-typed operand via "SelectDirectAddr".
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
2024
// Memory operands made of two sub-operands (a register and an immediate),
// printed with printMemOperand.  MEMri is the 32-bit flavor.
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
// 64-bit flavor of MEMri.
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}
2033
// Pointer-typed operand printed with the generic printOperand.
def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

// Same as imem, but typed iPTRAny.
def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operand printed with printLdStCode.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

// i32 operand printed with printMmaCode.
def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}
2049
// NVPTXISD::Wrapper: one result and one operand of the same, pointer, type.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
2052
// Load a memory address into a u32 or u64 register.
// Both forms select a Wrapper around a target global address.
def MOV_ADDR :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins imem:$a),
            "mov.u32 \t$dst, $a;",
            [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;

def MOV_ADDR64 :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins imem:$a),
            "mov.u64 \t$dst, $a;",
            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2060
// Get pointer to local stack.
// $num is appended to the "__local_depot" symbol name in the printed
// instruction.  Neither form has a selection pattern.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :
    NVPTXInst<(outs Int32Regs:$d),
              (ins i32imm:$num),
              "mov.u32 \t$d, __local_depot$num;",
              []>;
  def MOV_DEPOT_ADDR_64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins i32imm:$num),
              "mov.u64 \t$d, __local_depot$num;",
              []>;
}
2068
2069
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves.  All are flagged IsSimpleMove with no side
// effects and have no selection patterns.
let IsSimpleMove=1, hasSideEffects=0 in {
  // Integer moves using the unsigned (.u) / predicate type suffixes.
  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                           "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;

  // Same register classes, printed with the untyped (.b) suffixes.
  def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.b64 \t$dst, $sss;", []>;

  // Floating-point moves; the 16-bit one operates on Int16Regs.
  def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                           // We have to use .b16 here as there's no mov.f16.
                           "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;
}
2096
// Integer immediate moves; each is selected for an `imm` written to a
// register of the corresponding class.
def IMOV1ri :
  NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
            "mov.pred \t$dst, $src;",
            [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri :
  NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
            "mov.u16 \t$dst, $src;",
            [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri :
  NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
            "mov.u32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), imm:$src)]>;
def IMOV64ri :
  NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
            "mov.u64 \t$dst, $src;",
            [(set Int64Regs:$dst, imm:$src)]>;
2109
// Untyped (.bN) immediate moves.  These have no selection pattern.
def IMOVB16ri :
  NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
            "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri :
  NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
            "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri :
  NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
            "mov.b64 \t$dst, $src;", []>;
2116
// Floating-point immediate moves, selected for an `fpimm` of matching width.
def FMOV32ri :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
            "mov.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri :
  NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
            "mov.f64 \t$dst, $src;",
            [(set Float64Regs:$dst, fpimm:$src)]>;
2123
// A Wrapper around a texternalsym is selected as an immediate move of the
// matching integer width.
def : Pat<(i32 (Wrapper texternalsym:$dst)),
          (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)),
          (IMOV64ri texternalsym:$dst)>;
2126
//---- Copy Frame Index ----
// Selected from the ADDRri / ADDRri64 addressing patterns; the memory operand
// is printed with the `add` modifier as the source of an add.u32 / add.u64.
def LEA_ADDRi :
  NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
            "add.u32 \t$dst, ${addr:add};",
            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
            "add.u64 \t$dst, ${addr:add};",
            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
2134
2135//-----------------------------------
2136// Comparison and Selection
2137//-----------------------------------
2138
// Selection patterns for one integer comparison node.
//
//   OpNode     - the comparison PatFrag being matched.
//   Mode       - comparison-mode leaf forwarded unchanged as the last operand
//                of every selected instruction.
//   setp_*     - instructions used when the comparison result is an i1.
//   set_*      - instructions used when the comparison result is an i32.
//
// For each result type, patterns are emitted for 16-, 32- and 64-bit operands
// in three operand forms: register/register (rr), register/immediate (ri) and
// immediate/register (ir).
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}
2202
// ISET_FORMAT bound to the signed (SETP_s* / SET_s*) instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> :
    ISET_FORMAT<OpNode, Mode,
                // i1-producing instructions.
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                // i32-producing instructions.
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
2214
// ISET_FORMAT bound to the unsigned (SETP_u* / SET_u*) instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> :
    ISET_FORMAT<OpNode, Mode,
                // i1-producing instructions.
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                // i32-producing instructions.
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
2226
// Integer comparison patterns.  setgt/setlt/setge/setle/seteq/setne select
// the signed instruction family.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
// The setu* condition codes select the unsigned instruction family with the
// same comparison modes.
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
2239
// i1 compares
// Inequality of two predicates is their xor; setne and setune map to the
// same instruction.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

// Equality of two predicates is the negated xor; seteq and setueq map to the
// same instruction sequence.
def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2250
// comparisons of i8 extracted with BFE as i32
//
// NOTE(review): every pattern below compares the full 32-bit source
// registers, while the matched DAG only compares their low 8 bits (after
// sign-extension or masking).  This presumably relies on $a and $b already
// holding correctly extended 8-bit values -- confirm against the producers
// of these nodes.

// Signed: operands are sign-extended from i8 after truncation.
def : Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8),
                 (sext_inreg (trunc Int32Regs:$b), i8)),
          (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
def : Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8),
                 (sext_inreg (trunc Int32Regs:$b), i8)),
          (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
def : Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8),
                 (sext_inreg (trunc Int32Regs:$b), i8)),
          (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
def : Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8),
                 (sext_inreg (trunc Int32Regs:$b), i8)),
          (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;

// Unsigned and equality: operands are truncated and masked with 255.
def : Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)),
                  (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
def : Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)),
                  (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
def : Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)),
                  (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
def : Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)),
                  (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
def : Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)),
                 (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
def : Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)),
                 (i16 (and (trunc Int32Regs:$b), 255))),
          (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
2273
// i1 compare -> i32
// The i32 result is -1 when the comparison holds and 0 otherwise, chosen by a
// select on the xor of the two predicates.
//   setne: xor true  -> -1, xor false ->  0
//   seteq: xor true  ->  0, xor false -> -1
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2279
2280
2281
// Selection patterns for one floating-point comparison node.
//
//   OpNode  - the comparison PatFrag being matched.
//   Mode    - comparison-mode leaf used for the default patterns.
//   ModeFTZ - comparison-mode leaf used by the patterns guarded by doF32FTZ.
//
// Patterns are emitted for f16, bf16, f32 and f64 operands, first producing
// an i1 (SETP_*) and then an i32 (SET_*).  f16 patterns require useFP16Math
// and bf16 patterns require hasBF16Math.  For f16/bf16 an fpimm operand is
// first materialized with LOAD_CONST_F16 / LOAD_CONST_BF16, so those patterns
// always use the register/register instruction form.  f64 has no FTZ variant.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant is materialized into a register by LOAD_CONST_F16, so the
  // register/register form is used, as in the f16 -> pred patterns above.
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // The constant is materialized into a register by LOAD_CONST_BF16, so the
  // register/register form is used, as in the bf16 -> pred patterns above.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
2413
// Ordered comparisons (seto* condition codes).
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons (setu* condition codes) use the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Condition codes without an o/u prefix use the same comparison modes as the
// ordered forms above.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
2437
2438// FIXME: What is this doing here?  Can it be deleted?
2439// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
2440//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
2441
// SelectionDAG type profiles for the NVPTXISD parameter-passing, call and
// return nodes.  Each is SDTypeProfile<#results, #operands, [constraints]>.

// Parameter declarations.
def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

// Parameter loads (1, 2 or 4 results).
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

// Call printing.
def SDTPrintCallProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Parameter stores.
def SDTStoreParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
    : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
    : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

// Call arguments and call forms.
def SDTCallArgProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile
    : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile
    : SDTypeProfile<0, 1, []>;
def SDTCallValProfile
    : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile
    : SDTypeProfile<1, 1, []>;

// Return-value stores.
def SDTStoreRetvalProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile
    : SDTypeProfile<0, 5, [SDTCisInt<0>]>;

// Miscellaneous.
def SDTPseudoUseParamProfile
    : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile
    : SDTypeProfile<1, 1, []>;
2465
// Parameter / return declaration nodes.  All are chained, glued on both
// sides, and marked as having side effects.
def DeclareParam
    : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam
    : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam
    : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet
    : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter load nodes (scalar, v2, v4): chained, may load, glued on both
// sides.
def LoadParam
    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2
    : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4
    : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
// Call-printing nodes in plain / convergent and non-uni / uni variants.
def PrintCall
    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall
    : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni
    : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni
    : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter store nodes: scalar, v2, v4, and the U32 / S32 variants that
// share SDTStoreParam32Profile.
def StoreParam
    : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2
    : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4
    : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32
    : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32
    : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Nodes that make up the call sequence: argument list delimiters, individual
// arguments, and the call / prototype forms.
def CallArgBegin
    : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg
    : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg
    : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd
    : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid
    : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype
    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal
    : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam carries no node properties.
def MoveParam
    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores: chained with side effects, no glue.
def StoreRetval
    : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2
    : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4
    : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
             [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam
    : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode
    : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg
    : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2555
// ld.param from [retval0+$b] in scalar, .v2 and .v4 forms.  opstr supplies
// the type suffix.  All three are marked mayLoad and have no selection
// pattern.
class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                "ld.param" # opstr # " \t$dst, [retval0+$b];",
                []> {
  let mayLoad = true;
}

class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
                "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                []> {
  let mayLoad = true;
}

class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                      regclass:$dst4),
                (ins i32imm:$b),
                "ld.param.v4" # opstr
                  # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                []> {
  let mayLoad = true;
}
2575
// Register form of a parameter load: a mov from retval<$b>, selected from
// LoadParam with a leading (i32 0) operand and an immediate index.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                "mov" # opstr # " \t$dst, retval$b;",
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2580
// st.param to [param<$a>+$b] (StoreParam*) and to [func_retval0+$a]
// (StoreRetval*), each in scalar, .v2 and .v4 forms.  opstr supplies the type
// suffix.  All are marked mayStore and have no selection pattern.
class StoreParamInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
                "st.param" # opstr # " \t[param$a+$b], $val;",
                []> {
  let mayStore = true;
}

class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                "st.param.v2" # opstr # " \t[param$a+$b], {{$val, $val2}};",
                []> {
  let mayStore = true;
}

class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a, i32imm:$b),
                "st.param.v4" # opstr
                  # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                []> {
  let mayStore = true;
}

class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
                "st.param" # opstr # " \t[func_retval0+$a], $val;",
                []> {
  let mayStore = true;
}

class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
                "st.param.v2" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2}};",
                []> {
  let mayStore = true;
}

class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a),
                "st.param.v4" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                []> {
  let mayStore = true;
}
2621
// Call-prefix instructions.  Each def prints OpcStr followed by the list of
// return parameters (none, or retval0..retvalN-1) and is selected from OpNode
// applied to that count as an i32 constant, for counts 0 through 8.
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst :
      NVPTXInst<(outs), (ins),
                OpcStr # " ",
                [(OpNode (i32 0))]>;
    def PrintCallRetInst1 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0), ",
                [(OpNode (i32 1))]>;
    def PrintCallRetInst2 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1), ",
                [(OpNode (i32 2))]>;
    def PrintCallRetInst3 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2), ",
                [(OpNode (i32 3))]>;
    def PrintCallRetInst4 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2, retval3), ",
                [(OpNode (i32 4))]>;
    def PrintCallRetInst5 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
                [(OpNode (i32 5))]>;
    def PrintCallRetInst6 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                       # "retval5), ",
                [(OpNode (i32 6))]>;
    def PrintCallRetInst7 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                       # "retval5, retval6), ",
                [(OpNode (i32 7))]>;
    def PrintCallRetInst8 :
      NVPTXInst<(outs), (ins),
                OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                       # "retval5, retval6, retval7), ",
                [(OpNode (i32 8))]>;
  }
}
2652
// Regular call instructions: "call" is selected from PrintCall and
// "call.uni" from PrintCallUni.
defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
let isConvergent=1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2662
// ld.param instantiations.  The .b8 forms use Int16Regs, the same register
// class as the .b16 forms.

// Integer, scalar.
def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
// Integer, two elements.
def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
// Integer, four elements.
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
// Floating point.
def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
2679
// st.param [param...] instantiations.  The .b8 forms use Int16Regs, the same
// register class as the .b16 forms.

// Integer, scalar.
def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
// Integer, two elements.
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
// Integer, four elements.
def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
// Floating point.
def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
2699
// Instantiations of StoreRetval{,V2,V4}Inst, one per register class and PTX
// type suffix.  The ".b8" variants are defined on Int16Regs.

// Integer.
def StoreRetvalI64   : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32   : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16   : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8    : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8  : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8  : StoreRetvalV4Inst<Int16Regs, ".b8">;

// Floating point.
def StoreRetvalF64   : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32   : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
2717
// Operand-less instructions that print the call argument-list delimiters and
// the return.  CallArgEnd's i32 operand picks between ");" (1) and ")" (0).
def CallArgBeginInst :
  NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 :
  NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0 :
  NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst :
  NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2722
// A non-final call argument held in a register: prints "$a, " and matches
// CallArg with a leading (i32 0) operand.
class CallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a, ",
              [(CallArg (i32 0), regclass:$a)]>;

// Same as CallArgInst, but the pattern operand is typed with an explicit
// ValueType instead of the register class.
class CallArgInstVT<NVPTXRegClass regclass, ValueType vt>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a, ",
              [(CallArg (i32 0), vt:$a)]>;
2730
// The final call argument held in a register: prints "$a" with no trailing
// separator and matches LastCallArg with a leading (i32 0) operand.
class LastCallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a",
              [(LastCallArg (i32 0), regclass:$a)]>;

// Same as LastCallArgInst, but the pattern operand is typed with an explicit
// ValueType instead of the register class.
class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a",
              [(LastCallArg (i32 0), vt:$a)]>;
2737
// Register call arguments.  The i32 and i16 flavours use the *VT classes so
// the pattern carries an explicit value type.
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

// Final register call arguments, mirroring the list above.
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2749
// Immediate i32 call arguments (CallArg / LastCallArg with leading (i32 0)).
def CallArgI32imm : NVPTXInst<
  (outs), (ins i32imm:$a),
  "$a, ",
  [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<
  (outs), (ins i32imm:$a),
  "$a",
  [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Call arguments with leading (i32 1): the immediate is printed as the
// numeric suffix of "param".
def CallArgParam : NVPTXInst<
  (outs), (ins i32imm:$a),
  "param$a, ",
  [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<
  (outs), (ins i32imm:$a),
  "param$a",
  [(LastCallArg (i32 1), (i32 imm:$a))]>;
2759
// CallVoid with the callee given as a wrapped global address, a 32-bit
// register, or a 64-bit register; each prints "$addr, ".
def CallVoidInst : NVPTXInst<
  (outs), (ins imem:$addr),
  "$addr, ",
  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg : NVPTXInst<
  (outs), (ins Int32Regs:$addr),
  "$addr, ",
  [(CallVoid i32:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<
  (outs), (ins Int64Regs:$addr),
  "$addr, ",
  [(CallVoid Int64Regs:$addr)]>;

// Prints the ", prototype_<val>;" suffix.
def PrototypeInst : NVPTXInst<
  (outs), (ins i32imm:$val),
  ", prototype_$val;",
  [(Prototype (i32 imm:$val))]>;
2768
// Return-value declarations.  DeclareRetParam prints an aligned .b8 array;
// DeclareRet prints a scalar ".param" when its first operand is 1 and a
// ".reg" when it is 2.
def DeclareRetMemInst : NVPTXInst<
  (outs),
  (ins i32imm:$align, i32imm:$size, i32imm:$num),
  ".param .align $align .b8 retval$num[$size];",
  [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetScalarInst : NVPTXInst<
  (outs),
  (ins i32imm:$size, i32imm:$num),
  ".param .b$size retval$num;",
  [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst : NVPTXInst<
  (outs),
  (ins i32imm:$size, i32imm:$num),
  ".reg .b$size retval$num;",
  [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2781
// Parameter declarations.  DeclareParam prints an aligned .b8 array;
// DeclareScalarParam prints a ".param" when its last operand is 0 and a
// ".reg" when it is 1.
def DeclareParamInst : NVPTXInst<
  (outs),
  (ins i32imm:$align, i32imm:$a, i32imm:$size),
  ".param .align $align .b8 param$a[$size];",
  [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
def DeclareScalarParamInst : NVPTXInst<
  (outs),
  (ins i32imm:$a, i32imm:$size),
  ".param .b$size param$a;",
  [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst : NVPTXInst<
  (outs),
  (ins i32imm:$a, i32imm:$size),
  ".reg .b$size param$a;",
  [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2794
// Register-to-register MoveParam of type T, printed as "mov<asmstr>".
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins regclass:$src),
              "mov" # asmstr # " \t$dst, $src;",
              [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
2799
// MoveParam whose source is an external symbol (texternalsym) carried in a
// srcty operand; the result is a vt value in regclass.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins srcty:$src),
              "mov" # asmstr # " \t$dst, $src;",
              [(set vt:$dst, (MoveParam texternalsym:$src))]>;
2805
// Register MoveParam for 64- and 32-bit integers.
def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

// MoveParam from an external symbol.
def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The i16 flavour is spelled out by hand because it does not print a "mov".
def MoveParamI16 : NVPTXInst<
  (outs Int16Regs:$dst),
  (ins Int16Regs:$src),
  "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
  [(set i16:$dst, (MoveParam i16:$src))]>;

// Register MoveParam for floating point.
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
2818
// Matches PseudoUseParam on a vt value; the asm string is only a comment
// ("// Pseudo use of $src").
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt>
  : NVPTXInst<(outs),
              (ins regclass:$src),
              "// Pseudo use of $src",
              [(PseudoUseParam vt:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs,   i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs,   i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs,   i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
2829
// ProxyReg of type T within a single register class, printed as
// "mov.<SzStr>".
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass>
  : NVPTXInst<(outs regclass:$dst),
              (ins regclass:$src),
              "mov." # SzStr # " \t$dst, $src;",
              [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
2834
// One ProxyReg instruction per scalar type / register class.
def ProxyRegI1  : ProxyRegInst<"pred", i1,  Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16",  i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32",  i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64",  i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32",  f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64",  f64, Float64Regs>;
2841
// ProxyReg on the 16-bit floating-point types reuses the Int16Regs move.
foreach vt = [f16, bf16] in
  def : Pat<(vt (ProxyReg vt:$src)),
            (ProxyRegI16 Int16Regs:$src)>;

// ProxyReg on the vector types below reuses the Int32Regs move.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in
  def : Pat<(vt (ProxyReg vt:$src)),
            (ProxyRegI32 Int32Regs:$src)>;
2849
2850//
2851// Load / Store Handling
2852//
// Scalar load into regclass.  Defines one instruction per address form; all
// of them share the same modifier operands and mnemonic and carry no isel
// pattern.
multiclass LD<NVPTXRegClass regclass> {
  // Mnemonic shared by every variant, up to and including the tab that
  // precedes the operand list.
  defvar LdAsmPrefix =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";

  // imem address.
  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    !strconcat(LdAsmPrefix, "$dst, [$addr];"), []>;

  // Int32Regs address.
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    !strconcat(LdAsmPrefix, "$dst, [$addr];"), []>;

  // Int64Regs address.
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    !strconcat(LdAsmPrefix, "$dst, [$addr];"), []>;

  // Int32Regs address plus immediate offset.
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(LdAsmPrefix, "$dst, [$addr+$offset];"), []>;

  // Int64Regs address plus immediate offset.
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(LdAsmPrefix, "$dst, [$addr+$offset];"), []>;

  // imem address plus immediate offset.
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(LdAsmPrefix, "$dst, [$addr+$offset];"), []>;
}
2891
// Scalar load instructions.  LD_i8 and LD_i16 both use Int16Regs.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}
2900
// Scalar store from regclass.  Defines one instruction per address form; all
// of them share the same modifier operands and mnemonic and carry no isel
// pattern.
multiclass ST<NVPTXRegClass regclass> {
  // Mnemonic shared by every variant, up to and including the tab that
  // precedes the operand list.
  defvar StAsmPrefix =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth \t";

  // imem address.
  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         imem:$addr),
    !strconcat(StAsmPrefix, "[$addr], $src;"), []>;

  // Int32Regs address.
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int32Regs:$addr),
    !strconcat(StAsmPrefix, "[$addr], $src;"), []>;

  // Int64Regs address.
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int64Regs:$addr),
    !strconcat(StAsmPrefix, "[$addr], $src;"), []>;

  // Int32Regs address plus immediate offset.
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(StAsmPrefix, "[$addr+$offset], $src;"), []>;

  // Int64Regs address plus immediate offset.
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(StAsmPrefix, "[$addr+$offset], $src;"), []>;

  // imem address plus immediate offset.
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(StAsmPrefix, "[$addr+$offset], $src;"), []>;
}
2939
// Scalar store instructions.  ST_i8 and ST_i16 both use Int16Regs.
let mayStore = 1, hasSideEffects = 0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}
2948
2949// The following is used only in and after vector elementizations.  Vector
2950// elementization happens at the machine instruction level, so the following
2951// instructions never appear in the DAG.
// Two- and four-element vector loads into regclass.  For each width there is
// one instruction per address form; none carries an isel pattern.
multiclass LD_VEC<NVPTXRegClass regclass> {
  // Mnemonic shared by every variant, up to and including the tab that
  // precedes the operand list.
  defvar LdvAsmPrefix =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";

  //
  // Two destination registers.
  //

  // imem address.
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr];"), []>;

  // Int32Regs address.
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr];"), []>;

  // Int64Regs address.
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr];"), []>;

  // Int32Regs address plus immediate offset.
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr+$offset];"), []>;

  // Int64Regs address plus immediate offset.
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr+$offset];"), []>;

  // imem address plus immediate offset.
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix, "{{$dst1, $dst2}}, [$addr+$offset];"), []>;

  //
  // Four destination registers.
  //

  // imem address.
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;

  // Int32Regs address.
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;

  // Int64Regs address.
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;

  // Int32Regs address plus immediate offset.
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), []>;

  // Int64Regs address plus immediate offset.
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), []>;

  // imem address plus immediate offset.
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(LdvAsmPrefix,
               "{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), []>;
}
// Vector load instructions.  LDV_i8 and LDV_i16 both use Int16Regs.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}
3034
// Two- and four-element vector stores from regclass.  For each width there is
// one instruction per address form; none carries an isel pattern.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // Mnemonic shared by every variant, up to and including the tab that
  // precedes the operand list.
  defvar StvAsmPrefix =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth \t";

  //
  // Two source registers.
  //

  // imem address.
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    !strconcat(StvAsmPrefix, "[$addr], {{$src1, $src2}};"), []>;

  // Int32Regs address.
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    !strconcat(StvAsmPrefix, "[$addr], {{$src1, $src2}};"), []>;

  // Int64Regs address.
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    !strconcat(StvAsmPrefix, "[$addr], {{$src1, $src2}};"), []>;

  // Int32Regs address plus immediate offset.
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix, "[$addr+$offset], {{$src1, $src2}};"), []>;

  // Int64Regs address plus immediate offset.
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix, "[$addr+$offset], {{$src1, $src2}};"), []>;

  // imem address plus immediate offset.
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix, "[$addr+$offset], {{$src1, $src2}};"), []>;

  //
  // Four source registers.
  //

  // imem address.
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    !strconcat(StvAsmPrefix,
               "[$addr], {{$src1, $src2, $src3, $src4}};"), []>;

  // Int32Regs address.
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    !strconcat(StvAsmPrefix,
               "[$addr], {{$src1, $src2, $src3, $src4}};"), []>;

  // Int64Regs address.
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    !strconcat(StvAsmPrefix,
               "[$addr], {{$src1, $src2, $src3, $src4}};"), []>;

  // Int32Regs address plus immediate offset.
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix,
               "[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), []>;

  // Int64Regs address plus immediate offset.
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix,
               "[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), []>;

  // imem address plus immediate offset.
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    !strconcat(StvAsmPrefix,
               "[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), []>;
}
3118
// Vector store instructions.  STV_i8 and STV_i16 both use Int16Regs.
let mayStore = 1, hasSideEffects = 0 in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}
3127
3128//---- Conversion ----
3129
// bitconvert from TIn to TOut, printed as "mov.b<SzStr>".  The register
// classes default to whatever ValueToRegClass yields for each value type.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret>
  : NVPTXInst<(outs regclassOut:$d),
              (ins regclassIn:$a),
              "mov.b" # SzStr # " \t$d, $a;",
              [(set (TOut regclassOut:$d),
                    (bitconvert (TIn regclassIn:$a)))]>;
3136
// Integer <-> floating-point bitconverts of equal width.
def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;  // i32 -> f32
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;  // f32 -> i32
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;  // i64 -> f64
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;  // f64 -> i64
3141
// bitconvert between f32 and the vector types held in Int32Regs reuses the
// 32-bit integer <-> float moves.
foreach vecVT = [v2f16, v2bf16, v2i16, v4i8] in {
  // f32 -> vector
  def : Pat<(vecVT (bitconvert (f32 Float32Regs:$a))),
            (BITCONVERT_32_F2I Float32Regs:$a)>;
  // vector -> f32
  def : Pat<(f32 (bitconvert (vecVT Int32Regs:$a))),
            (BITCONVERT_32_I2F Int32Regs:$a)>;
}
// bitconvert between i16 and the 16-bit floating-point types.
foreach fpVT = [f16, bf16] in {
  // Constant i16 source: materialize it with a 16-bit move-immediate.
  def : Pat<(fpVT (bitconvert (i16 UInt16Const:$a))),
            (IMOVB16ri UInt16Const:$a)>;
  // Register source, i16 -> fp.
  def : Pat<(fpVT (bitconvert (i16 Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
  // Register source, fp -> i16.
  def : Pat<(i16 (bitconvert (fpVT Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
}
3156
// bitconvert among the types held in Int32Regs.
foreach dstVT = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  // Constant i32 source: materialize it with a 32-bit move-immediate.
  def : Pat<(dstVT (bitconvert (i32 UInt32Const:$a))),
            (IMOVB32ri UInt32Const:$a)>;

  // Register source: one pattern per distinct (dst, src) pair.
  foreach srcVT = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(dstVT, srcVT) then {
      def : Pat<(dstVT (bitconvert (srcVT Int32Regs:$a))),
                (ProxyRegI32 Int32Regs:$a)>;
    }
  }
}
3167
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns.  Therefore, we
// use an integer selp to materialize the integer value of the predicate and
// then cvt to floating-point.
//
// A true i1 is -1 when read as a signed integer and 1 when read as an
// unsigned integer.  The sint_to_fp patterns therefore select between -1 and
// 0 (giving -1.0 / 0.0) and the uint_to_fp patterns between 1 and 0 (giving
// 1.0 / 0.0).

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> bf16
def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
          (CVT_bf16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
          (CVT_bf16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
          (CVT_bf16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
          (CVT_bf16_s64 Int64Regs:$a, CvtRN)>;

// uint -> bf16
def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
3251
3252
// f16 -> sint
// The i1 result is a raw-bits compare against 0; wider results use cvt with
// CvtRZI.
// NOTE(review): the i1 patterns here use CmpEQ, i.e. the predicate is set
// when the f16 bits are all zero -- confirm this is the intended result.
def : Pat<
  (i1 (fp_to_sint (f16 Int16Regs:$a))),
  (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint (f16 Int16Regs:$a))),
  (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint (f16 Int16Regs:$a))),
  (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint (f16 Int16Regs:$a))),
  (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<
  (i1 (fp_to_uint (f16 Int16Regs:$a))),
  (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint (f16 Int16Regs:$a))),
  (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint (f16 Int16Regs:$a))),
  (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint (f16 Int16Regs:$a))),
  (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
3272
// bf16 -> sint
// The i1 result is a raw-bits compare against 0; wider results use cvt with
// CvtRZI.
// NOTE(review): the i1 patterns here use CmpEQ, i.e. the predicate is set
// when the bf16 bits are all zero -- confirm this is the intended result.
def : Pat<
  (i1 (fp_to_sint (bf16 Int16Regs:$a))),
  (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint (bf16 Int16Regs:$a))),
  (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint (bf16 Int16Regs:$a))),
  (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint (bf16 Int16Regs:$a))),
  (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;

// bf16 -> uint
def : Pat<
  (i1 (fp_to_uint (bf16 Int16Regs:$a))),
  (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint (bf16 Int16Regs:$a))),
  (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint (bf16 Int16Regs:$a))),
  (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint (bf16 Int16Regs:$a))),
  (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
// f32 -> sint
// Each width has two patterns: one using the _FTZ conversion mode, guarded by
// doF32FTZ, listed first, followed by the unguarded non-FTZ one.
// NOTE(review): the i1 patterns compare the raw f32 bits against 0 with
// CmpEQ -- confirm this is the intended result.
def : Pat<
  (i1 (fp_to_sint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<
  (i1 (fp_to_uint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
3323
// f64 -> sint
// The i1 result is a raw-bits compare against 0; wider results use cvt with
// CvtRZI.
// NOTE(review): the i1 patterns compare the raw f64 bits against 0 with
// CmpEQ -- confirm this is the intended result.
def : Pat<
  (i1 (fp_to_sint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float64Regs:$a)),
  (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float64Regs:$a)),
  (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float64Regs:$a)),
  (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<
  (i1 (fp_to_uint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float64Regs:$a)),
  (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float64Regs:$a)),
  (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float64Regs:$a)),
  (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
3343
// sext i1: select -1 when the predicate is set, 0 otherwise.
def : Pat<
  (i16 (sext Int1Regs:$a)),
  (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (sext Int1Regs:$a)),
  (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (sext Int1Regs:$a)),
  (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: select 1 when the predicate is set, 0 otherwise.
def : Pat<
  (i16 (zext Int1Regs:$a)),
  (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (zext Int1Regs:$a)),
  (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (zext Int1Regs:$a)),
  (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: select -1 when the predicate is set, 0 otherwise.
def : Pat<
  (i16 (anyext Int1Regs:$a)),
  (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (anyext Int1Regs:$a)),
  (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (anyext Int1Regs:$a)),
  (SELP_u64ii -1, 0, Int1Regs:$a)>;
3367
3368// sext i16
3369def : Pat<(i32 (sext Int16Regs:$a)),
3370          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
3371def : Pat<(i64 (sext Int16Regs:$a)),
3372          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
3373
3374// zext i16
3375def : Pat<(i32 (zext Int16Regs:$a)),
3376          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3377def : Pat<(i64 (zext Int16Regs:$a)),
3378          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
3379
3380// anyext i16
3381def : Pat<(i32 (anyext Int16Regs:$a)),
3382          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3383def : Pat<(i64 (anyext Int16Regs:$a)),
3384          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
3385
3386// sext i32
3387def : Pat<(i64 (sext Int32Regs:$a)),
3388          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
3389
3390// zext i32
3391def : Pat<(i64 (zext Int32Regs:$a)),
3392          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
3393
3394// anyext i32
3395def : Pat<(i64 (anyext Int32Regs:$a)),
3396          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
3397
3398
3399// truncate i64
3400def : Pat<(i32 (trunc Int64Regs:$a)),
3401          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
3402def : Pat<(i16 (trunc Int64Regs:$a)),
3403          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
3404def : Pat<(i1 (trunc Int64Regs:$a)),
3405          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
3406
3407// truncate i32
3408def : Pat<(i16 (trunc Int32Regs:$a)),
3409          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
3410def : Pat<(i1 (trunc Int32Regs:$a)),
3411          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
3412
3413// truncate i16
3414def : Pat<(i1 (trunc Int16Regs:$a)),
3415          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
3416
3417// sext_inreg
3418def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
3419def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
3420def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
3421def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
3422def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
3423def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
3424
3425
3426// Select instructions with 32-bit predicates
3427def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
3428          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
3429          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3430def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
3431          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
3432          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3433def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
3434          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
3435          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3436def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
3437          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
3438          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3439def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
3440          (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
3441          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3442def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
3443          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
3444          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3445def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
3446          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
3447          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3448
3449
3450let hasSideEffects = false in {
3451  // pack a set of smaller int registers to a larger int register
  // Each def emits a single mov.b32/mov.b64 whose source operand is a
  // brace-enclosed tuple of the narrower registers.  None of these defs
  // carries a selection pattern (the pattern list is []).
3452  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
3453                             (ins Int16Regs:$s1, Int16Regs:$s2,
3454                                  Int16Regs:$s3, Int16Regs:$s4),
3455                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
3456  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
3457                             (ins Int16Regs:$s1, Int16Regs:$s2),
3458                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
3459  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
3460                             (ins Int32Regs:$s1, Int32Regs:$s2),
3461                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
  // Same form, but packing two Float32Regs into one Float64Regs.
3462  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
3463                             (ins Float32Regs:$s1, Float32Regs:$s2),
3464                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
3465
3466  // unpack a larger int register to a set of smaller int registers
  // Mirror image of the pack defs: the brace-enclosed tuple is the
  // destination and the wide register is the source.
3467  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
3468                                   Int16Regs:$d3, Int16Regs:$d4),
3469                             (ins Int64Regs:$s),
3470                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
3471  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
3472                             (ins Int32Regs:$s),
3473                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
3474  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
3475                             (ins Int64Regs:$s),
3476                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  // Same form, but unpacking one Float64Regs into two Float32Regs.
3477  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
3478                             (ins Float64Regs:$s),
3479                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3480
  // Extract just one half of a wider register.  The asm string declares a
  // scratch register named `tmp` and uses it as the tuple element that is
  // not wanted, so only one output operand ($high or $low) is produced.
  // Only a high-half variant is defined in this block for the 64-bit source.
3481  def I32toI16H  : NVPTXInst<(outs Int16Regs:$high),
3482                             (ins Int32Regs:$s),
3483                             "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
3484                             []>;
3485  def I32toI16L  : NVPTXInst<(outs Int16Regs:$low),
3486                             (ins Int32Regs:$s),
3487                             "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
3488                             []>;
3489  def I64toI32H  : NVPTXInst<(outs Int32Regs:$high),
3490                             (ins Int64Regs:$s),
3491                             "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
3492                             []>;
3493}
3494
3495// Using partial vectorized move produces better SASS code for extraction of
3496// upper/lower parts of an integer.
3497def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
3498          (I32toI16H Int32Regs:$s)>;
3499def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
3500          (I32toI16H Int32Regs:$s)>;
3501def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
3502          (I64toI32H Int64Regs:$s)>;
3503def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
3504          (I64toI32H Int64Regs:$s)>;
3505
3506def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
3507         (CVT_INREG_s32_s16 Int32Regs:$src)>;
3508
3509foreach vt = [v2f16, v2bf16, v2i16] in {
3510def : Pat<(extractelt (vt Int32Regs:$src), 0),
3511          (I32toI16L Int32Regs:$src)>;
3512def : Pat<(extractelt (vt Int32Regs:$src), 1),
3513          (I32toI16H Int32Regs:$src)>;
3514}
3515def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
3516          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3517def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
3518          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3519def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
3520          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3521
3522def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
3523         (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3524
3525// Count leading zeros
3526let hasSideEffects = false in {
  // Both variants write their result to a 32-bit register (Int32Regs), even
  // when the input is a 64-bit register.  Neither def carries a pattern.
3527  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3528                         "clz.b32 \t$d, $a;", []>;
3529  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3530                         "clz.b64 \t$d, $a;", []>;
3531}
3532
3533// 32-bit has a direct PTX instruction
3534def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
3535
3536// The return type of the ctlz ISD node is the same as its input, but the PTX
3537// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
3538// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
3539// truncating back down to 32 bits.
3540def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
3541def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
3542
3543// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
3544// result back to 16-bits if necessary.  We also need to subtract 16 because
3545// the high-order 16 zeros were counted.
3546//
3547// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3548// use to save one SASS instruction (on sm_35 anyway):
3549//
3550//   mov.b32 $tmp, {0xffff, $a}
3551//   clz.b32 $result, $tmp
3552//
3553// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3554// and then ctlz that value.  This way we don't have to subtract 16 from the
3555// result.  Unfortunately today we don't have a way to generate
3556// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3557def : Pat<(i16 (ctlz Int16Regs:$a)),
3558          (SUBi16ri (CVT_u16_u32
3559           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
3560def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
3561          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
3562
3563// Population count
3564let hasSideEffects = false in {
  // Both variants write their result to a 32-bit register (Int32Regs), even
  // when the input is a 64-bit register.  Neither def carries a pattern.
3565  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3566                          "popc.b32 \t$d, $a;", []>;
3567  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3568                          "popc.b64 \t$d, $a;", []>;
3569}
3570
3571// 32-bit has a direct PTX instruction
3572def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
3573
3574// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3575// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
3576// pattern that avoids the type conversion if we're truncating the result to
3577// i32 anyway.
3578def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
3579def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
3580
3581// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3582// If we know that we're storing into an i32, we can avoid the final trunc.
3583def : Pat<(ctpop Int16Regs:$a),
3584          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
3585def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
3586          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
3587
3588// fpround f32 -> f16
3589def : Pat<(f16 (fpround Float32Regs:$a)),
3590          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
3591
3592// fpround f32 -> bf16
3593def : Pat<(bf16 (fpround Float32Regs:$a)),
3594          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
3595
3596// fpround f64 -> f16
3597def : Pat<(f16 (fpround Float64Regs:$a)),
3598          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
3599
3600// fpround f64 -> bf16
3601def : Pat<(bf16 (fpround Float64Regs:$a)),
3602          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;
3603// fpround f64 -> f32
3604def : Pat<(f32 (fpround Float64Regs:$a)),
3605          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
3606def : Pat<(f32 (fpround Float64Regs:$a)),
3607          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
3608
3609// fpextend f16 -> f32
3610def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3611          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3612def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3613          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
3614// fpextend bf16 -> f32
3615def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3616          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3617def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3618          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;
3619
3620// fpextend f16 -> f64
3621def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
3622          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
3623
3624// fpextend bf16 -> f64
3625def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
3626          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;
3627
3628// fpextend f32 -> f64
3629def : Pat<(f64 (fpextend Float32Regs:$a)),
3630          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3631def : Pat<(f64 (fpextend Float32Regs:$a)),
3632          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
3633
3634def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
3635                     [SDNPHasChain, SDNPOptInGlue]>;
3636
3637// fceil, ffloor, froundeven, ftrunc.
3638
// Emits the selection patterns that lower the unary node OpNode to a
// same-type CVT instruction for f16, bf16, f32 and f64 operands.
//   OpNode  - the SDNode being matched.
//   Mode    - conversion-mode leaf passed to the f16, bf16 and f64 CVTs, and
//             to the f32 CVT when doNoF32FTZ holds.
//   ModeFTZ - conversion-mode leaf passed to the f32 CVT when doF32FTZ holds.
3639multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
3640  def : Pat<(OpNode (f16 Int16Regs:$a)),
3641            (CVT_f16_f16 Int16Regs:$a, Mode)>;
3642  def : Pat<(OpNode (bf16 Int16Regs:$a)),
3643            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
  // f32 is the only type with two patterns; each is guarded by an explicit
  // predicate (doF32FTZ / doNoF32FTZ).
3644  def : Pat<(OpNode Float32Regs:$a),
3645            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
3646  def : Pat<(OpNode Float32Regs:$a),
3647            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
3648  def : Pat<(OpNode Float64Regs:$a),
3649            (CVT_f64_f64 Float64Regs:$a, Mode)>;
3650}
3651
3652defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
3653defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
3654defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
3655defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;
3656
3657// nearbyint and rint are implemented as rounding to nearest even.  This isn't
3658// strictly correct, because it causes us to ignore the rounding mode.  But it
3659// matches what CUDA's "libm" does.
3660
3661defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
3662defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;
3663
3664//-----------------------------------
3665// Control-flow
3666//-----------------------------------
3667
3668let isTerminator=1 in {
   // Function return; selected directly from the retglue node.
3669   let isReturn=1, isBarrier=1 in
3670      def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;
3671
   // Conditional branch taken when predicate $a is set ("@$a bra").
3672   let isBranch=1 in
3673      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3674                              "@$a bra \t$target;",
3675                              [(brcond Int1Regs:$a, bb:$target)]>;
   // Conditional branch on the negated predicate ("@!$a bra").  It has no
   // built-in pattern, so it is only selected through a separate Pat.
3676   let isBranch=1 in
3677      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3678                                   "@!$a bra \t$target;", []>;
3679
   // Unconditional branch, printed as "bra.uni".
3680   let isBranch=1, isBarrier=1 in
3681      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
3682                           "bra.uni \t$target;", [(br bb:$target)]>;
3683}
3684
3685def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
3686          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
3687
3688// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
3689// conditional branch if the target block is the next block so that the code
3690// can fall through to the target block.  The inversion is done by 'xor
3691// condition, 1', which will be translated to (setne condition, -1).  Since ptx
3692// supports '@!pred bra target', we should use it.
3693def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
3694          (CBranchOther Int1Regs:$a, bb:$target)>;
3695
3696// Call
3697def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
3698                                            SDTCisVT<1, i32>]>;
3699def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
3700
3701def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
3702                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3703def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
3704                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
3705                            SDNPSideEffect]>;
3706
3707def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
3708def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
3709                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
3710def calltarget : Operand<i32>;
3711let isCall=1 in {
3712   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
3713}
3714
3715def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
3716def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3717
3718// Pseudo instructions.
3719class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
3720   : NVPTXInst<outs, ins, asmstr, pattern>;
3721
// Selected from callseq_start.  The asm text opens a brace scope (the '{' is
// escaped in the string), echoes both immediates in a "// callseq" comment,
// and then declares a .b32 register named temp_param_reg.
3722def Callseq_Start :
3723  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3724            "\\{ // callseq $amt1, $amt2\n"
3725            "\t.reg .b32 temp_param_reg;",
3726            [(callseq_start timm:$amt1, timm:$amt2)]>;
// Selected from callseq_end.  The asm text closes the brace scope; only
// $amt1 is echoed in the trailing comment even though $amt2 is an operand.
3727def Callseq_End :
3728  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3729            "\\} // callseq $amt1",
3730            [(callseq_end timm:$amt1, timm:$amt2)]>;
3731
3732// trap instruction
3733// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
3734// This won't be necessary in a future version of ptxas.
3735def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;
3736
3737// Call prototype wrapper
3738def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3739def CallPrototype :
3740  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
3741         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
3742def ProtoIdent : Operand<i32> {
3743  let PrintMethod = "printProtoIdent";
3744}
3745def CALL_PROTOTYPE :
3746  NVPTXInst<(outs), (ins ProtoIdent:$ident),
3747            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3748
3749
3750include "NVPTXIntrinsics.td"
3751
3752
3753//-----------------------------------
3754// Notes
3755//-----------------------------------
3756// BSWAP is currently expanded. The following would be more efficient:
3757// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
3758// - for sm_20, use prmt (use vector scalar mov to get the pack and
3759//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
3760//   registers.
3761