//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the PTX instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

include "NVPTXInstrFormats.td"

// Immediate operands for the two 16-bit floating-point value types.
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
def bf16imm : Operand<bf16> {
  let OperandType = "OPERAND_IMMEDIATE";
}

// List of vector specific properties
def isVecLD      : VecInstTypeEnum<1>;
def isVecST      : VecInstTypeEnum<2>;
def isVecBuild   : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert  : VecInstTypeEnum<6>;
def isVecDest    : VecInstTypeEnum<7>;
def isVecOther   : VecInstTypeEnum<15>;

//===----------------------------------------------------------------------===//
// NVPTX Operand Definitions.
//===----------------------------------------------------------------------===//

// Branch target operand.
def brtarget : Operand<OtherVT>;

// CVT conversion modes
// These must match the enum in NVPTX.h
//
// Base rounding modes occupy the low nibble.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI  : PatLeaf<(i32 0x1)>;
def CvtRZI  : PatLeaf<(i32 0x2)>;
def CvtRMI  : PatLeaf<(i32 0x3)>;
def CvtRPI  : PatLeaf<(i32 0x4)>;
def CvtRN   : PatLeaf<(i32 0x5)>;
def CvtRZ   : PatLeaf<(i32 0x6)>;
def CvtRM   : PatLeaf<(i32 0x7)>;
def CvtRP   : PatLeaf<(i32 0x8)>;
def CvtRNA  : PatLeaf<(i32 0x9)>;

// The *_FTZ variants are the base value with 0x10 added.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;

// Saturating variants use 0x20 (0x30 when combined with FTZ).
def CvtSAT     : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// The *_RELU variants are the base value with 0x40 added.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU   : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU   : PatLeaf<(i32 0x46)>;

// Operand carrying one of the Cvt* values above; printed by printCvtMode.
let PrintMethod = "printCvtMode" in
def CvtMode : Operand<i32>;

// Compare modes
// These must match the enum in NVPTX.h
def CmpEQ  : PatLeaf<(i32 0)>;
def CmpNE  : PatLeaf<(i32 1)>;
def CmpLT  : PatLeaf<(i32 2)>;
def CmpLE  : PatLeaf<(i32 3)>;
def CmpGT  : PatLeaf<(i32 4)>;
def CmpGE  : PatLeaf<(i32 5)>;
def CmpLO  : PatLeaf<(i32 6)>;
def CmpLS  : PatLeaf<(i32 7)>;
def CmpHI  : PatLeaf<(i32 8)>;
def CmpHS  : PatLeaf<(i32 9)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// The *_FTZ compare modes are the base value with 0x100 added.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// Operand carrying one of the Cmp* values; printed by printCmpMode.
let PrintMethod = "printCmpMode" in
def CmpMode : Operand<i32>;

// Vector element selector operand; printed by printVecElement.
let PrintMethod = "printVecElement" in
def VecElement : Operand<i32>;

// PRMT modes
// These must match the enum in NVPTX.h
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E  : PatLeaf<(i32 0x1)>;
def PrmtB4E  : PatLeaf<(i32 0x2)>;
def PrmtRC8  : PatLeaf<(i32 0x3)>;
def PrmtECL  : PatLeaf<(i32 0x4)>;
def PrmtECR  : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;

// Operand carrying one of the Prmt* values; printed by printPrmtMode.
let PrintMethod = "printPrmtMode" in
def PrmtMode : Operand<i32>;


//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//


// Subtarget feature queries.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero selection; the two predicates are exact complements.
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// FMA contraction and unsafe-math selection, each with its complement.
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath    : Predicate<"!allowUnsafeFPMath()">;

// f32 division level: 0 selects the approximate form, 1 the full form.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

// f32 square root: approximate unless precise sqrt is requested.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate that always holds.
def True : Predicate<"true">;

// Minimum-version predicates: true when the subtarget's PTX / SM version is
// at least `version`.
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                        "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;

// Helper class to aid conversion between ValueType and a matching RegisterClass.

// Maps a ValueType to the register class that holds it. The lookup is done on
// the type's record name; `ret` is the selected class. There is no default
// case, so a type that is not listed below is a TableGen error.
//
// Note that the 16-bit FP types live in Int16Regs and all packed two-element
// vector types live in Int32Regs.
class ValueToRegClass<ValueType T> {
   string name = !cast<string>(T);
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
     !eq(name, "v2i16"): Int32Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     !eq(name, "f16"): Int16Regs,
     !eq(name, "v2f16"): Int32Regs,
     !eq(name, "bf16"): Int16Regs,
     !eq(name, "v2bf16"): Int32Regs,
     !eq(name, "f32"): Float32Regs,
     !eq(name, "f64"): Float64Regs,
     // Argument register classes, keyed "a" + <type>.
     !eq(name, "ai32"): Int32ArgRegs,
     !eq(name, "ai64"): Int64ArgRegs,
     !eq(name, "af32"): Float32ArgRegs,
     // Was spelled "if64", which broke the "a<type>" naming used by the three
     // keys above.
     !eq(name, "af64"): Float64ArgRegs,
    );
}


//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit: register/register, then register/immediate.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit. Int32Regs also holds packed vector types, so the register
  // operands carry an explicit i32 type.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 16-bit.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}

// Packed v2i16 operation on a 32-bit register, named "<OpcStr>16x2".
// Gated on PTX >= 80 and SM >= 90.
class I16x2<string OpcStr, SDNode OpNode> :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            OpcStr # "16x2 \t$dst, $a, $b;",
            [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
  Requires<[hasPTX<80>, hasSM<90>]>;

// Template for instructions which take 3 int args.  The instructions are
// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  // All four forms are marked as having side effects.
  let hasSideEffects = 1 in {
    // 32-bit forms ("<OpcStr>.s32").
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

    // 64-bit forms ("<OpcStr>.s64"); these additionally require PTX >= 43.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}

// Template for instructions which take three fp64 or fp32 args.  The
// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // ---- f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32: the .ftz forms are gated on doF32FTZ ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16 (scalar in Int16Regs); requires useFP16Math ----
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- f16x2 (packed v2f16 in Int32Regs); requires useFP16Math ----
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- bf16 (scalar in Int16Regs); requires hasBF16Math ----
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math]>;

  // ---- bf16x2 (packed v2bf16 in Int32Regs); requires hasBF16Math ----
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math]>;
}

// Template for instructions which take three FP args.  The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- Plain forms, selected when allowFMA holds ---------------------===//

  // f64
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // f32
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // f16 (scalar in Int16Regs)
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // f16x2 (packed v2f16 in Int32Regs)
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // bf16 (scalar in Int16Regs)
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  // bf16x2 (packed v2bf16 in Int32Regs)
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  //===--- ".rn" forms, selected when noFMA holds ------------------------===//

  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnbf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
  def _rnbf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
}

// Template for operations which take two f32 or f64 operands.  Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  // The .ftz form is gated on doF32FTZ.
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}

// Single-operand template for the half-precision types: bf16/bf16x2 and
// f16/f16x2 (the latter also in .ftz form). Scalars use Int16Regs, packed
// pairs use Int32Regs.
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  // bf16 forms need SM >= 80 and PTX >= 70.
  def bf16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".bf16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".bf16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;

  // f16 forms need SM >= 53 and PTX >= 65; the .ftz ones also need doF32FTZ.
  def f16_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".ftz.f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".ftz.f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;

}

//===----------------------------------------------------------------------===//
// NVPTX Instructions.
//===----------------------------------------------------------------------===//

//-----------------------------------
// Type Conversion
//-----------------------------------

let hasSideEffects = false in {
  // Generate a cvt to the given type from all possible types.  Each instance
  // takes a CvtMode immediate that defines the conversion mode to use.  It can
  // be CvtNONE to omit a conversion mode.
  //
  // None of these carry a selection pattern. Preds gates every variant except
  // _bf16 (which always uses its own requirements) and, when ToType is
  // "bf16", _f32.
  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
    // 8- and 16-bit integer sources both come from Int16Regs.
    def _s8 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".s8 \t$dst, $src;", []>,
      Requires<Preds>;
    def _u8 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".u8 \t$dst, $src;", []>,
      Requires<Preds>;
    def _s16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".s16 \t$dst, $src;", []>,
      Requires<Preds>;
    def _u16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".u16 \t$dst, $src;", []>,
      Requires<Preds>;

    // 32-bit integer sources.
    def _s32 :
      NVPTXInst<(outs RC:$dst),
                (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".s32 \t$dst, $src;", []>,
      Requires<Preds>;
    def _u32 :
      NVPTXInst<(outs RC:$dst),
                (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".u32 \t$dst, $src;", []>,
      Requires<Preds>;

    // 64-bit integer sources.
    def _s64 :
      NVPTXInst<(outs RC:$dst),
                (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".s64 \t$dst, $src;", []>,
      Requires<Preds>;
    def _u64 :
      NVPTXInst<(outs RC:$dst),
                (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".u64 \t$dst, $src;", []>,
      Requires<Preds>;

    // Floating-point sources. The bf16 and f32 sources also print the
    // ${mode:relu} modifier.
    def _f16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".f16 \t$dst, $src;", []>,
      Requires<Preds>;
    def _bf16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}." #
                ToType # ".bf16 \t$dst, $src;", []>,
      Requires<!if(!eq(ToType, "f32"),
                   // bf16->f32 was introduced early.
                   [hasPTX<71>, hasSM<80>],
                   // bf16->everything else needs sm90/ptx78
                   [hasPTX<78>, hasSM<90>])>;
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}." #
                ToType # ".f32 \t$dst, $src;", []>,
      Requires<!if(!eq(ToType, "bf16"),
                   // f32->bf16 was introduced early.
                   [hasPTX<70>, hasSM<80>],
                   Preds)>;
    def _f64 :
      NVPTXInst<(outs RC:$dst),
                (ins Float64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                ToType # ".f64 \t$dst, $src;", []>,
      Requires<Preds>;
  }

  // Generate cvts from all types to all types.
  defm CVT_s8   : CVT_FROM_ALL<"s8",   Int16Regs>;
  defm CVT_u8   : CVT_FROM_ALL<"u8",   Int16Regs>;
  defm CVT_s16  : CVT_FROM_ALL<"s16",  Int16Regs>;
  defm CVT_u16  : CVT_FROM_ALL<"u16",  Int16Regs>;
  defm CVT_s32  : CVT_FROM_ALL<"s32",  Int32Regs>;
  defm CVT_u32  : CVT_FROM_ALL<"u32",  Int32Regs>;
  defm CVT_s64  : CVT_FROM_ALL<"s64",  Int64Regs>;
  defm CVT_u64  : CVT_FROM_ALL<"u64",  Int64Regs>;
  defm CVT_f16  : CVT_FROM_ALL<"f16",  Int16Regs>;
  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
  defm CVT_f32  : CVT_FROM_ALL<"f32",  Float32Regs>;
  defm CVT_f64  : CVT_FROM_ALL<"f64",  Float64Regs>;

  // These cvts are different from those above: The source and dest registers
  // are of the same type.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;

  // Two f32 sources converted into one packed 32-bit result of type FromName;
  // requires PTX >= 70 and SM >= 80.
  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
                "cvt${mode:base}${mode:relu}." #
                FromName # ".f32 \t$dst, $src1, $src2;", []>,
      Requires<[hasPTX<70>, hasSM<80>]>;
  }

  defm CVT_f16x2  : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
}

//-----------------------------------
//
// Selection instructions (selp)
//-----------------------------------

// TODO: Missing slct

// selp instructions that don't have any pattern matches; we explicitly use
// them within this file.
let hasSideEffects = false in {
  // Pattern-less selp in all four register/immediate operand combinations.
  // The suffix names the kinds of $a and $b (r = register, i = immediate).
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ri :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ir :
      NVPTXInst<(outs RC:$dst),
                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ii :
      NVPTXInst<(outs RC:$dst),
                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
  }

  // Same four operand combinations, each matching a `select` node of value
  // type T. ImmNode is the immediate leaf used in the patterns.
  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
                          Operand ImmCls, SDNode ImmNode> {
    def rr :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
    def ri :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
    def ir :
      NVPTXInst<(outs RC:$dst),
                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
    def ii :
      NVPTXInst<(outs RC:$dst),
                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}

// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.
// 16-bit integer selects: only the .b16 form carries patterns.
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
// 32-bit integer selects.
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
// 64-bit integer selects.
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16/bf16 selects are emitted as selp.b16 on Int16Regs.
defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;

// This does not work as tablegen fails to infer the type of 'imm'.
// def v2f16imm : Operand<v2f16>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;

// Register/register selects of the packed 32-bit vector types reuse
// SELP_b32rr.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
            (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
}

//-----------------------------------
// Test Instructions
//-----------------------------------

// testp.infinite on a register (r) or immediate (i) operand; no patterns.
def TESTINF_f32r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
            "testp.infinite.f32 \t$p, $a;",
            []>;
def TESTINF_f32i :
  NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
            "testp.infinite.f32 \t$p, $a;",
            []>;
def TESTINF_f64r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
            "testp.infinite.f64 \t$p, $a;",
            []>;
def TESTINF_f64i :
  NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
            "testp.infinite.f64 \t$p, $a;",
            []>;

//-----------------------------------
// Integer Arithmetic
//-----------------------------------

// Template for xor masquerading as int1 arithmetic.
// OpNode is the DAG node (add or sub) each form is selected for; both the
// register-register and register-immediate forms emit xor.pred.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr:
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri:
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

def ADD16x2 : I16x2<"add.s", add>;
def SUB16x2 : I16x2<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

defm MULT : I3<"mul.lo.s", mul>;

defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;

// Integer absolute value, selected directly to the PTX abs instruction.
// NOTE(review): this comment previously described a NumBits parameter and the
// shift/xor idiom from
// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs; the ABS
// multiclass that follows takes no such parameter.
// T is the value type, RC its register class, and SizeName the PTX type
// suffix appended to "abs" (for example ".s32").
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set (T RC:$dst), (abs (T RC:$a)))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;


//
// Wide multiplication
//
// Each mul.wide flavour comes in three operand shapes: register-register,
// register-immediate with a source-width immediate ("Imm"), and
// register-immediate with a result-width immediate ("Imm32"/"Imm64").
// None of the instructions carries its own pattern; they are selected through
// the standalone Pat records in this file.

// 32 x 32 -> 64, signed.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;

// Target nodes for widening multiplies: one result, two operands that must
// share a type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.
let Predicates = [doMulWide] in {
  def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
            (MULWIDES32 i16:$a, i16:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}

// Predicates used for converting some patterns to mul.wide.
// Immediate that fits in 32 bits when interpreted as signed.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate that fits in 32 bits when interpreted as unsigned.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate that fits in 16 bits when interpreted as signed.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate that fits in 16 bits when interpreted as unsigned.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

def IntConst_0_30 : PatLeaf<(imm), [{
  // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
  const APInt &Amt = N->getAPIntValue();
  if (Amt.slt(0))
    return false;
  return Amt.slt(31);
}]>;

def IntConst_0_14 : PatLeaf<(imm), [{
  // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
  const APInt &Amt = N->getAPIntValue();
  if (Amt.slt(0))
    return false;
  return Amt.slt(15);
}]>;

// Rewrites a shift amount v into the 32-bit multiplier (1 << v).
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i32);
}]>;

// Rewrites a shift amount v into the 16-bit multiplier (1 << v).
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i16);
}]>;

// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
// Every pattern in this group is gated on doMulWide.
let Predicates = [doMulWide] in {
  // The shift amount is turned into the multiplier (1 << amt) by SHL2MUL*.
  def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;

  // Convert "sign/zero-extend then multiply" to mul.wide.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}

//
// Integer multiply-add
//
// imad has one integer result and three operands, all of the result's type.
def SDTIMAD :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// mad.lo.s16: $a is always a register; $b and $c may each be a register (r)
// or an immediate (i), giving the rrr/rri/rir/rii forms.
def MAD16rrr :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, imm:$c))]>;

// mad.lo.s32, same four operand shapes.
def MAD32rrr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b),
                        (i32 Int32Regs:$c)))]>;
def MAD32rri :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
def MAD32rir :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
def MAD32rii :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;

// mad.lo.s64, same four operand shapes.
def MAD64rrr :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, imm:$c))]>;

// Integer negation.
def INEG16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "neg.s16 \t$dst, $src;",
            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "neg.s32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
def INEG64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "neg.s64 \t$dst, $src;",
            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;

//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;

// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
def LOAD_CONST_BF16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
defm FNEG_H: F2_Support_Half<"neg", fneg>;

defm FSQRT : F2<"sqrt.rn", fsqrt>;

//
// F16 NEG
//
// OpcStr is the full PTX opcode, T the value type held in RC, and Pred an
// extra predicate (doF32FTZ for the .ftz forms, True otherwise).
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                !strconcat(OpcStr, " \t$dst, $src;"),
                [(set RC:$dst, (fneg (T RC:$src)))]>,
                Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;

//
// BF16 NEG
//

// Same shape as FNEG_F16_F16X2, gated on hasBF16Math, PTX 7.0 and SM 80.
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                !strconcat(OpcStr, " \t$dst, $src;"),
                [(set RC:$dst, (fneg (T RC:$src)))]>,
                Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;

//
// F64 division
//
// 1.0 / b is emitted as a reciprocal; $a only carries the matched constant
// and is not printed.
def FDIV641r :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins f64imm:$a, Float64Regs:$b),
            "rcp.rn.f64 \t$dst, $b;",
            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
def FDIV64rr :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, Float64Regs:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, f64imm:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;

//
// F32 Approximate reciprocal
//
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
def FDIV32approxrr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
def FDIV32approxri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
def FDIV32rr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32ri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32rr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL]>;
def FDIV32ri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
def FDIV32rr_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;

//
// FMA
//

// f32/f64 fused multiply-add. $a is always a register; $b and $c may each be
// a register (r) or an immediate of class ImmCls (i). Pred gates all forms.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rri : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, ImmCls:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
   def rir : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rii : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
}

// Register-only fma for f16 / v2f16, gated on useFP16Math and Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
                       Requires<[useFP16Math, Pred]>;
}

// Register-only fma for bf16 / v2bf16, gated on hasBF16Math and Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
                       Requires<[hasBF16Math, Pred]>;
}

defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "sin.approx.f32 \t$dst, $src;",
                    [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
                    Requires<[allowUnsafeFPMath]>;
def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "cos.approx.f32 \t$dst, $src;",
                    [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
                    Requires<[allowUnsafeFPMath]>;

// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
// semantics of LLVM's frem.
//
// Under allowUnsafeFPMath only the sub/mul/cvt/div chain is emitted. Under
// noUnsafeFPMath the chain is wrapped in a selp that yields $x when testp
// reports $y as infinite. The infinity test must use the register form of
// TESTINF when $y is a register and the immediate form when $y is an fpimm.

// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
             Float32Regs:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
             fpimm:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
             Float32Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
             Float64Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
              Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
// $y is an immediate here, so the infinity test uses the immediate form
// (TESTINF_f64i), mirroring the f32 immediate patterns above.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;

//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for three-arg bitwise operations. Takes three args, Creates .b16,
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
// Each width gets a register-register (rr) and a register-immediate (ri)
// form, selected for OpNode.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // i1 / predicate registers
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

  // 16-bit
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

  // 32-bit
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 64-bit
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}

defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
  // register, register
  def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
           (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;

  // The constants get legalized into a bitcast from i32, so that's what we need
  // to match here.
  def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (ORb32ri Int32Regs:$a, imm:$b)>;
  def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (XORb32ri Int32Regs:$a, imm:$b)>;
  def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
           (ANDb32ri Int32Regs:$a, imm:$b)>;
}

// Bitwise complement for predicate, 16-, 32- and 64-bit registers.
def NOT1 :
  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
            "not.pred \t$dst, $src;",
            [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "not.b16 \t$dst, $src;",
            [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "not.b32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
def NOT64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "not.b64 \t$dst, $src;",
            [(set Int64Regs:$dst, (not Int64Regs:$src))]>;

// Template for left/right shifts. Takes three operands,
// [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16, but shift is always int32.
//
// This template also defines a 32-bit shift (imm, imm) instruction.
// OpcStr is the opcode up to (not including) the bit width, e.g. "shl.b";
// the width ("64", "32", "16") is appended per form.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit value, 32-bit shift amount
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst,
                    (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

  // 32-bit value, including the (imm, imm) form
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
  def i32ii :
    NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

  // 16-bit value, 32-bit shift amount
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
            "brev.b32 \t$dst, $a;",
            [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
            "brev.b64 \t$dst, $a;",
            [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;

//
// Rotate: Use ptx shf
instruction if available. 1623// 1624 1625// 32 bit r2 = rotl r1, n 1626// => 1627// r2 = shf.l r1, r1, n 1628def ROTL32imm_hw : 1629 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1630 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1631 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>, 1632 Requires<[hasHWROT32]>; 1633 1634def ROTL32reg_hw : 1635 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1636 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1637 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1638 Requires<[hasHWROT32]>; 1639 1640// 32 bit r2 = rotr r1, n 1641// => 1642// r2 = shf.r r1, r1, n 1643def ROTR32imm_hw : 1644 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1645 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1646 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>, 1647 Requires<[hasHWROT32]>; 1648 1649def ROTR32reg_hw : 1650 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1651 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1652 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1653 Requires<[hasHWROT32]>; 1654 1655// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 
// Emitted as a scoped shl/shr pair whose results are combined with add. The
// instruction carries no ISel pattern of its own; the Pat<> records below
// select it and compute the complementary amount with SUB_FRM_32.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Maps an immediate N to the i32 target constant 32 - N.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// rotl passes (amt, 32 - amt) and rotr passes (32 - amt, amt) as the
// (left-shift, right-shift) amounts. Only used when noHWROT32 holds.
def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;

// 32-bit software rotate left by register.
// The complementary amount (32 - $amt) is computed at run time into %amt2.
// NOTE(review): no masking of $amt is done here; the result for $amt >= 32
// depends on PTX out-of-range shift behaviour -- confirm this is intended.
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shl.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[noHWROT32]>;

// 32-bit software rotate right by register.
// Same sequence as ROTL32reg_sw with the shl/shr roles exchanged.
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shr.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[noHWROT32]>;

// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Emitted as a scoped shl/shr pair combined with add. No ISel pattern is
// attached to the instruction; the Pat<> records below select it.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Maps an immediate N to the i32 target constant 64 - N.
def SUB_FRM_64 : SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// Unlike the 32-bit patterns, these carry no Requires<> predicate.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;

// 64-bit software rotate left by register.
// The complementary amount (64 - $amt) is computed at run time into %amt2.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shl.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;

// 64-bit software rotate right by register.
// Same sequence as ROTL64reg_sw with the shl/shr roles exchanged.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shr.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;

//
// Funnel shift in clamp mode
//

// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// shf.l.clamp.b32: selected for FUN_SHFL_CLAMP on three i32 register operands
// ($lo, $hi, $amt), printed in that order.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
               (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;

// shf.r.clamp.b32: right-shift counterpart of FUNSHFLCLAMP.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
             (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;

//
// BFE - bit-field extract
//

// Template for BFE/BFI instructions.
// Args: [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
// Start may be an imm only if end is also an imm.  FIXME: Is this a
// restriction in PTX?
// NOTE(review): the PTX ISA appears to define the last bfe/bfi operand as the
// field length in bits rather than an end position -- confirm and reword
// "end" above if so.
//
// dest and src may be int32 or int64, but start and end are always int32.
// bfe: one integer result with the same type as operand 1; operands 2 and 3
// are i32.
def SDTBFE :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

// bfi: one integer result with the same type as operands 1 and 2; operands 3
// and 4 are i32.
def SDTBFI :
  SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                       SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

// prmt: i32 result and four i32 operands.
def SDTPRMT :
  SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;

// Bit-field extract. Instr is the full mnemonic (e.g. "bfe.s32"); T/RC are
// the value type and register class of the source and destination.
// Suffix letters describe operands $a, $b, $c in order: r = register,
// i = immediate.
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}

// Bit-field insert. ImmCls is the immediate operand class used when the
// first source ($a) is an immediate (the i*** variants).
// Suffix letters describe operands $a, $b, $c, $d in order.
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  def rrrr
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def rrri
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def rrii
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
  def irrr
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def irri
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def irii
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
}

// Byte permute. The mnemonic is always prmt.b32 followed by the printed
// $mode operand. Suffix letters describe operands $a, $b, $c in order.
// NOTE(review): $b is declared as Int32Regs in the operand list but matched
// as (T RC:$b) in the pattern; these coincide for the only instantiation
// below (i32/Int32Regs).
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
}

let hasSideEffects = false in {
  // The signed and unsigned BFE flavours share one `bfe` pattern per type;
  // the explicit Pat<> records further down pick a specific flavour.
  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
  defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
  defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;

  defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
  defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;

  defm PRMT_B32 : PRMT<i32, Int32Regs>;
}


// byte extraction + signed/unsigned extension to i32.
// A sext_inreg-from-i8 around an 8-bit bfe folds into bfe.s32, and a mask
// with 255 folds into bfe.u32.
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;

// byte extraction + signed extension to i16
def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;


// Byte extraction via shift/trunc/sext
// A right shift by an immediate $o followed by sign extension from i8 is
// selected as an 8-bit bfe.s at offset $o; results narrower than the source
// additionally go through a cvt.s8.
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s),  (i32 imm:$o))), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i32 Int32Regs:$s),  (i32 imm:$o)), i8),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s),  (i32 imm:$o)), i8),
          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s),  (i32 imm:$o))), i8)),
          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;

//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------

// FIXME: This doesn't cover versions of set and setp that combine with a
// boolean predicate, e.g. setp.eq.and.b16.

let hasSideEffects = false in {
  // setp: compare two operands and write a predicate (Int1Regs) result.
  // TypeStr is the PTX operand type suffix; ImmCls is the immediate class
  // used by the ri/ir variants. The comparison is carried in the $cmp operand
  // and printed through its `base` and `ftz` modifiers. No ISel patterns are
  // attached here.
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
  }
}

defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// Half-precision compares only exist in register-register form; f16/bf16
// scalars live in Int16Regs and the packed x2 forms in Int32Regs.
def SETP_f16rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                []>, Requires<[useFP16Math]>;

// Packed compare: produces two predicates, printed as "$p|$q".
def SETP_f16x2rr :
      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                []>,
                Requires<[useFP16Math]>;
def SETP_bf16rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
                []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

def SETP_bf16x2rr :
      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
                []>,
                Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;


// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.

let hasSideEffects = false in {
  // set: like SETP but the result is written to an Int32Regs register, and
  // $cmp is printed without modifiers.
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ri : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ir : NVPTXInst<(outs Int32Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
  }
}

defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;

//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------

// Addressing-mode matchers. Each names the C++ selector routine that
// implements it; ADDRri/ADDRri64 yield two operands and are rooted at
// frameindex nodes, ADDRvar yields one.
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
                            [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
                              [SDNPWantRoot]>;
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// Register + immediate memory operands (32- and 64-bit).
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operands printed by dedicated printer methods.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}

// Wrapper: pointer-typed node whose result type equals its operand type.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
                         "mov.u32 \t$dst, $a;",
                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                           "mov.u64 \t$dst, $a;",
                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
// The immediate $num is appended to the "__local_depot" symbol name in the
// emitted mov. No ISel pattern is attached.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
                                     "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                    "mov.u64 \t$d, __local_depot$num;", []>;
}


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves. None carries an ISel pattern.
let IsSimpleMove=1, hasSideEffects=0 in {
  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                           "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;

  // Same moves printed with the untyped .b suffix.
  def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.b64 \t$dst, $sss;", []>;

  def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                           // We have to use .b16 here as there's no mov.f16.
                           "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;
}

// Immediate materialization: these are the patterns that select plain
// integer `imm` nodes of each width.
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                        "mov.pred \t$dst, $src;",
                        [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set (i32 Int32Regs:$dst), imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                         "mov.u64 \t$dst, $src;",
                         [(set Int64Regs:$dst, imm:$src)]>;

// Untyped (.b) immediate moves; no ISel pattern attached.
def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                         "mov.b64 \t$dst, $src;", []>;

def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;

// Wrapped external symbols are materialized with the integer immediate moves.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;

//---- Copy Frame Index ----
// Matches an ADDRri/ADDRri64 address and prints it through the operand's
// `add` modifier as the source operands of an add.
def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                            "add.u32 \t$dst, ${addr:add};",
                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;

//-----------------------------------
// Comparison and Selection
//-----------------------------------

// Integer comparison patterns.
//
// For one comparison PatFrag (OpNode) and one compare-mode leaf (Mode) this
// emits patterns for i16/i32/i64 operands in rr/ri/ir form, producing either
// a predicate (i1, via the setp_* instructions) or an i32 (via the set_*
// instructions). The instructions to use are passed in explicitly so that
// signed and unsigned instantiations can share this body.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}

// ISET_FORMAT bound to the signed (.s16/.s32/.s64) setp/set instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (.u16/.u32/.u64) setp/set instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// Signed and unsigned instantiations use the same compare-mode leaves; the
// signedness comes from the .s*/.u* instructions chosen above.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;

// i1 compares
// Inequality of two predicates is their XOR; equality is the negated XOR.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;

// comparisons of i8 extracted with BFE as i32
// The 8-bit extension/mask wrappers are dropped and the underlying i32
// registers are compared directly.
def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;

def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;

// i1 compare -> i32
// The XOR of the two predicates selects between -1 and 0. setne puts -1 in
// the first select operand; seteq uses the mirrored operand order.
// (The seteq pattern previously matched setne a second time, so it could
// never be selected.)
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;



// Floating-point comparison patterns.
//
// For one comparison PatFrag (OpNode) this emits patterns for f16, bf16, f32
// and f64 operands, producing either a predicate (i1, SETP_*) or an i32
// (SET_*). Mode is the plain compare mode and ModeFTZ its flush-to-zero
// counterpart. For f16/bf16/f32 the ModeFTZ pattern is listed first and
// guarded by doF32FTZ; f64 only uses Mode.
//
// f16 and bf16 have no immediate instruction forms in use here: a constant
// operand is first materialized into a register with LOAD_CONST_F16 /
// LOAD_CONST_BF16 and the register-register instruction is used.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant is materialized into a register, so the rr form is the one
  // whose operand classes match (the ir form expects an immediate for $a).
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // As for f16: the materialized constant is a register, so use the rr form.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}

// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons use the *U compare modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons without an explicit ordered/unordered marker are mapped to the
// same modes as the ordered ones.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;

// FIXME: What is this doing here?  Can it be deleted?
// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
//                            [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

//===----------------------------------------------------------------------===//
// Type profiles for the NVPTXISD call / parameter / return-value nodes.
// SDTypeProfile<NumResults, NumOperands, Constraints>.
//===----------------------------------------------------------------------===//

def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile      : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile   : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile     : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile   : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile   : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile   : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile        : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile    : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile       : SDTypeProfile<0, 1, []>;
def SDTCallValProfile        : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile      : SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile  : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile  : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile       : SDTypeProfile<1, 1, []>;

//===----------------------------------------------------------------------===//
// SelectionDAG nodes used while lowering calls, parameters and return values.
//===----------------------------------------------------------------------===//

def DeclareParam
    : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam
    : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam
    : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet
    : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LoadParam
    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2
    : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4
    : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def PrintCall
    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall
    : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni
    : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni
    : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParam
    : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2
    : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4
    : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32
    : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32
    : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgBegin
    : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg
    : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg
    : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd
    : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid
    : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype
    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal
    : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam
    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def StoreRetval
    : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2
    : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4
    : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
             [SDNPHasChain, SDNPSideEffect]>;
def PseudoUseParam
    : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode
    : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg
    : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

//===----------------------------------------------------------------------===//
// Instruction classes for reading call results (ld.param from retval0) and
// writing outgoing parameters / return values (st.param).  None of them carry
// an isel pattern (the pattern list is empty).
//===----------------------------------------------------------------------===//

let mayLoad = true in {
  // Scalar load at byte offset $b of retval0.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                  "ld.param" # opstr # " \t$dst, [retval0+$b];",
                  []>;

  // Two-element vector load at byte offset $b of retval0.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
                  "ld.param.v2" # opstr #
                    " \t{{$dst, $dst2}}, [retval0+$b];",
                  []>;

  // Four-element vector load at byte offset $b of retval0.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                        regclass:$dst4),
                  (ins i32imm:$b),
                  "ld.param.v4" # opstr #
                    " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                  []>;
}

// Register form: copies retval$b with a mov; selected from LoadParam whose
// first operand is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                "mov" # opstr # " \t$dst, retval$b;",
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;

let mayStore = true in {
  // Scalar store to param$a at byte offset $b.
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
                  "st.param" # opstr # " \t[param$a+$b], $val;",
                  []>;

  // Two-element vector store to param$a at byte offset $b.
  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                  "st.param.v2" # opstr #
                    " \t[param$a+$b], {{$val, $val2}};",
                  []>;

  // Four-element vector store to param$a at byte offset $b.
  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a, i32imm:$b),
                  "st.param.v4" # opstr #
                    " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                  []>;

  // Scalar store to func_retval0 at byte offset $a.
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
                  "st.param" # opstr # " \t[func_retval0+$a], $val;",
                  []>;

  // Two-element vector store to func_retval0 at byte offset $a.
  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
                  "st.param.v2" # opstr #
                    " \t[func_retval0+$a], {{$val, $val2}};",
                  []>;

  // Four-element vector store to func_retval0 at byte offset $a.
  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a),
                  "st.param.v4" # opstr #
                    " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                  []>;
}

// Prints the leading part of a call: the opcode followed by the list of
// retvalN names.  The i32 operand of OpNode selects how many names (0-8) are
// printed.
let isCall = true in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst
        : NVPTXInst<(outs), (ins), OpcStr # " ", [(OpNode (i32 0))]>;
    def PrintCallRetInst1
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0), ",
                    [(OpNode (i32 1))]>;
    def PrintCallRetInst2
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0, retval1), ",
                    [(OpNode (i32 2))]>;
    def PrintCallRetInst3
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0, retval1, retval2), ",
                    [(OpNode (i32 3))]>;
    def PrintCallRetInst4
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3), ",
                    [(OpNode (i32 4))]>;
    def PrintCallRetInst5
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
                    [(OpNode (i32 5))]>;
    def PrintCallRetInst6
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5), ",
                    [(OpNode (i32 6))]>;
    def PrintCallRetInst7
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5, retval6), ",
                    [(OpNode (i32 7))]>;
    def PrintCallRetInst8
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5, retval6, retval7), ",
                    [(OpNode (i32 8))]>;
  }
}

defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
let isConvergent = true in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}

// ld.param from retval0.  The i8 variants use Int16Regs as their register
// class.
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;

// st.param to an outgoing parameter.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

def StoreParamF32    : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64    : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32  : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64  : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32  : StoreParamV4Inst<Float32Regs, ".f32">;

// st.param to func_retval0.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;

// Punctuation of the printed argument list, plus the function return.
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;

// A call argument followed by ", ".  The plain form matches on the register
// class, the VT form on an explicit value type.
class CallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a), "$a, ",
                [(CallArg (i32 0), regclass:$a)]>;

class CallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a), "$a, ",
                [(CallArg (i32 0), vt:$a)]>;

// The final call argument: same as above but without the trailing ", ".
class LastCallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a), "$a",
                [(LastCallArg (i32 0), regclass:$a)]>;
class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a), "$a",
                [(LastCallArg (i32 0), vt:$a)]>;

def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;

// Immediate arguments (first CallArg operand is 0).
def CallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
                [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a), "$a",
                [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Arguments printed as param$a (first CallArg operand is 1).
def CallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
                [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
                [(LastCallArg (i32 1), (i32 imm:$a))]>;

// The call target: a wrapped global address or a 32/64-bit register.
def CallVoidInst
    : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg
    : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
                [(CallVoid i32:$addr)]>;
def CallVoidInstReg64
    : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                [(CallVoid Int64Regs:$addr)]>;
def PrototypeInst
    : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
                [(Prototype (i32 imm:$val))]>;

// Declarations of the retvalN slots.  For DeclareRet the first operand picks
// the printed form: 1 prints a .param declaration, 2 a .reg declaration.
def DeclareRetMemInst
    : NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
                ".param .align $align .b8 retval$num[$size];",
                [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                                  (i32 imm:$num))]>;
def DeclareRetScalarInst
    : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
                ".param .b$size retval$num;",
                [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst
    : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
                ".reg .b$size retval$num;",
                [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations of the paramN slots.  For DeclareScalarParam the last operand
// picks the printed form: 0 prints a .param declaration, 1 a .reg declaration.
def DeclareParamInst
    : NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
                ".param .align $align .b8 param$a[$size];",
                [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                               (i32 imm:$size))]>;
def DeclareScalarParamInst
    : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
                ".param .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst
    : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
                ".reg .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;

// MoveParam of a register value, printed as a mov of the given type suffix.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr>
    : NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

// MoveParam of an external symbol.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr>
    : NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set vt:$dst, (MoveParam texternalsym:$src))]>;

// MoveParam instantiations for register and external-symbol sources.
def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The i16 form is spelled out by hand because it prints a cvt instead of the
// mov used by MoveParamInst.
def MoveParamI16
    : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
                [(set i16:$dst, (MoveParam i16:$src))]>;
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;

// Selected from PseudoUseParam.  The assembly text is only a "//" comment
// naming the register; no PTX operation is printed.
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$src),
                "// Pseudo use of $src",
                [(PseudoUseParam vt:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;

// Selected from ProxyReg: a same-type register copy printed as "mov.<SzStr>".
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass>
    : NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
                "mov." # SzStr # " \t$dst, $src;",
                [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

def ProxyRegI1  : ProxyRegInst<"pred", i1, Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;

// ProxyReg of f16 / bf16 values reuses the 16-bit integer copy.
foreach vt = [f16, bf16] in
  def : Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;

// ProxyReg of the packed vector types reuses the 32-bit integer copy.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in
  def : Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;

//
// Load / Store Handling
//
// Every addressing variant of a multiclass below shares the same modifier
// operands and mnemonic prefix, so those are built once as local defvars and
// the address operands are appended with !con.  Variant suffixes:
//   _avar               imem address
//   _areg / _areg_64    Int32Regs / Int64Regs address
//   _ari  / _ari_64     Int32Regs / Int64Regs address + i32 immediate offset
//   _asi                imem address + i32 immediate offset
// None of these instructions carries an isel pattern.
multiclass LD<NVPTXRegClass regclass> {
  defvar LdMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  defvar LdOpc =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";
  defvar LdAsm    = LdOpc # "\t$dst, [$addr];";
  defvar LdAsmOff = LdOpc # "\t$dst, [$addr+$offset];";

  def _avar    : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins imem:$addr)),
                           LdAsm, []>;
  def _areg    : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int32Regs:$addr)),
                           LdAsm, []>;
  def _areg_64 : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int64Regs:$addr)),
                           LdAsm, []>;
  def _ari     : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int32Regs:$addr, i32imm:$offset)),
                           LdAsmOff, []>;
  def _ari_64  : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins Int64Regs:$addr, i32imm:$offset)),
                           LdAsmOff, []>;
  def _asi     : NVPTXInst<(outs regclass:$dst),
                           !con(LdMods, (ins imem:$addr, i32imm:$offset)),
                           LdAsmOff, []>;
}

let mayLoad = true, hasSideEffects = false in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}

// Scalar stores: same variants as LD, with the stored value as the leading
// input operand.
multiclass ST<NVPTXRegClass regclass> {
  defvar StHead = (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
                       LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth);
  defvar StOpc =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth";
  defvar StAsm    = StOpc # " \t[$addr], $src;";
  defvar StAsmOff = StOpc # " \t[$addr+$offset], $src;";

  def _avar    : NVPTXInst<(outs),
                           !con(StHead, (ins imem:$addr)),
                           StAsm, []>;
  def _areg    : NVPTXInst<(outs),
                           !con(StHead, (ins Int32Regs:$addr)),
                           StAsm, []>;
  def _areg_64 : NVPTXInst<(outs),
                           !con(StHead, (ins Int64Regs:$addr)),
                           StAsm, []>;
  def _ari     : NVPTXInst<(outs),
                           !con(StHead, (ins Int32Regs:$addr, i32imm:$offset)),
                           StAsmOff, []>;
  def _ari_64  : NVPTXInst<(outs),
                           !con(StHead, (ins Int64Regs:$addr, i32imm:$offset)),
                           StAsmOff, []>;
  def _asi     : NVPTXInst<(outs),
                           !con(StHead, (ins imem:$addr, i32imm:$offset)),
                           StAsmOff, []>;
}

let mayStore = true, hasSideEffects = false in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}

// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass> {
  defvar LdMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  defvar LdOpc =
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  // Two destination registers.
  defvar Dst2     = (outs regclass:$dst1, regclass:$dst2);
  defvar Asm2     = LdOpc # "\t{{$dst1, $dst2}}, [$addr];";
  defvar Asm2Off  = LdOpc # "\t{{$dst1, $dst2}}, [$addr+$offset];";

  def _v2_avar    : NVPTXInst<Dst2, !con(LdMods, (ins imem:$addr)),
                              Asm2, []>;
  def _v2_areg    : NVPTXInst<Dst2, !con(LdMods, (ins Int32Regs:$addr)),
                              Asm2, []>;
  def _v2_areg_64 : NVPTXInst<Dst2, !con(LdMods, (ins Int64Regs:$addr)),
                              Asm2, []>;
  def _v2_ari     : NVPTXInst<Dst2,
                              !con(LdMods,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Asm2Off, []>;
  def _v2_ari_64  : NVPTXInst<Dst2,
                              !con(LdMods,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Asm2Off, []>;
  def _v2_asi     : NVPTXInst<Dst2,
                              !con(LdMods, (ins imem:$addr, i32imm:$offset)),
                              Asm2Off, []>;

  // Four destination registers.
  defvar Dst4     = (outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
                          regclass:$dst4);
  defvar Asm4     = LdOpc # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];";
  defvar Asm4Off  = LdOpc #
                    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];";

  def _v4_avar    : NVPTXInst<Dst4, !con(LdMods, (ins imem:$addr)),
                              Asm4, []>;
  def _v4_areg    : NVPTXInst<Dst4, !con(LdMods, (ins Int32Regs:$addr)),
                              Asm4, []>;
  def _v4_areg_64 : NVPTXInst<Dst4, !con(LdMods, (ins Int64Regs:$addr)),
                              Asm4, []>;
  def _v4_ari     : NVPTXInst<Dst4,
                              !con(LdMods,
                                   (ins Int32Regs:$addr, i32imm:$offset)),
                              Asm4Off, []>;
  def _v4_ari_64  : NVPTXInst<Dst4,
                              !con(LdMods,
                                   (ins Int64Regs:$addr, i32imm:$offset)),
                              Asm4Off, []>;
  def _v4_asi     : NVPTXInst<Dst4,
                              !con(LdMods, (ins imem:$addr, i32imm:$offset)),
                              Asm4Off, []>;
}
let mayLoad = true, hasSideEffects = false in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}

// Vector stores: the 2 or 4 source registers come first, followed by the
// modifier operands and the address.  Note that the width operand is named
// $fromWidth here, unlike the scalar ST multiclass.
multiclass ST_VEC<NVPTXRegClass regclass> {
  defvar StMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  defvar StOpc =
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  // Two source registers.
  defvar Src2     = !con((ins regclass:$src1, regclass:$src2), StMods);
  defvar Asm2     = StOpc # "\t[$addr], {{$src1, $src2}};";
  defvar Asm2Off  = StOpc # "\t[$addr+$offset], {{$src1, $src2}};";

  def _v2_avar    : NVPTXInst<(outs), !con(Src2, (ins imem:$addr)),
                              Asm2, []>;
  def _v2_areg    : NVPTXInst<(outs), !con(Src2, (ins Int32Regs:$addr)),
                              Asm2, []>;
  def _v2_areg_64 : NVPTXInst<(outs), !con(Src2, (ins Int64Regs:$addr)),
                              Asm2, []>;
  def _v2_ari     : NVPTXInst<(outs),
                              !con(Src2, (ins Int32Regs:$addr, i32imm:$offset)),
                              Asm2Off, []>;
  def _v2_ari_64  : NVPTXInst<(outs),
                              !con(Src2, (ins Int64Regs:$addr, i32imm:$offset)),
                              Asm2Off, []>;
  def _v2_asi     : NVPTXInst<(outs),
                              !con(Src2, (ins imem:$addr, i32imm:$offset)),
                              Asm2Off, []>;

  // Four source registers.
  defvar Src4     = !con((ins regclass:$src1, regclass:$src2, regclass:$src3,
                              regclass:$src4),
                         StMods);
  defvar Asm4     = StOpc # "\t[$addr], {{$src1, $src2, $src3, $src4}};";
  defvar Asm4Off  = StOpc #
                    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};";

  def _v4_avar    : NVPTXInst<(outs), !con(Src4, (ins imem:$addr)),
                              Asm4, []>;
  def _v4_areg    : NVPTXInst<(outs), !con(Src4, (ins Int32Regs:$addr)),
                              Asm4, []>;
  def _v4_areg_64 : NVPTXInst<(outs), !con(Src4, (ins Int64Regs:$addr)),
                              Asm4, []>;
  def _v4_ari     : NVPTXInst<(outs),
                              !con(Src4, (ins Int32Regs:$addr, i32imm:$offset)),
                              Asm4Off, []>;
  def _v4_ari_64  : NVPTXInst<(outs),
                              !con(Src4, (ins Int64Regs:$addr, i32imm:$offset)),
                              Asm4Off, []>;
  def _v4_asi     : NVPTXInst<(outs),
                              !con(Src4, (ins imem:$addr, i32imm:$offset)),
                              Asm4Off, []>;
}

let mayStore = true, hasSideEffects = false in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}

//---- Conversion ----

// Same-width bitcast between two value types, printed as "mov.b<SzStr>".
// The register classes default to whatever ValueToRegClass maps TIn / TOut to.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret>
    : NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
                "mov.b" # SzStr # " \t$d, $a;",
                [(set (TOut regclassOut:$d),
                      (bitconvert (TIn regclassIn:$a)))]>;

def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;

// f32 <-> packed 32-bit vector types reuse the f32 <-> i32 moves.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(vt (bitconvert (f32 Float32Regs:$a))),
            (BITCONVERT_32_F2I Float32Regs:$a)>;
  def : Pat<(f32 (bitconvert (vt Int32Regs:$a))),
            (BITCONVERT_32_I2F Int32Regs:$a)>;
}

// i16 <-> f16 / bf16: constants become an immediate move, register values a
// 16-bit ProxyReg copy.
foreach vt = [f16, bf16] in {
  def : Pat<(vt (bitconvert (i16 UInt16Const:$a))),
            (IMOVB16ri UInt16Const:$a)>;
  def : Pat<(vt (bitconvert (i16 Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
  def : Pat<(i16 (bitconvert (vt Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
}

// Bitcasts among the 32-bit types: constants become an immediate move, and
// every pair of distinct types gets a 32-bit ProxyReg copy.
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  def : Pat<(ta (bitconvert (i32 UInt32Const:$a))),
            (IMOVB32ri UInt32Const:$a)>;
  foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(ta, tb) then {
      def : Pat<(ta (bitconvert (tb Int32Regs:$a))),
                (ProxyRegI32 Int32Regs:$a)>;
    }
  }
}

// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns.  Therefore, we
// use an integer selp to select either 1 or 0 and then cvt to floating-point.
// Integer -> floating-point conversions.
//
// Every conversion rounds to nearest (CvtRN). An i1 source has no direct cvt,
// so the predicate is first expanded into a 32-bit integer with selp and the
// 32-bit cvt is applied to that. For the signed conversions an i1 "true" is
// -1, so the predicate must be expanded to -1/0; for the unsigned conversions
// it is expanded to 1/0.

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> bf16
def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
          (CVT_bf16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
          (CVT_bf16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
          (CVT_bf16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
          (CVT_bf16_s64 Int64Regs:$a, CvtRN)>;

// uint -> bf16
def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;


// Floating-point -> integer conversions.
//
// Every conversion to i16/i32/i64 rounds toward zero (CvtRZI). The f32
// sources additionally get an FTZ variant, listed first and guarded by
// doF32FTZ. Conversions to i1 are done with a setp on the raw bits of the
// source instead of a cvt.
// NOTE(review): the i1 patterns below compare the source bits against 0 with
// CmpEQ, i.e. the predicate is set when the bits are all zero - confirm that
// this polarity is intended.

// f16 -> sint
def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;

// bf16 -> sint
def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;

// bf16 -> uint
def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;

// f32 -> sint
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;

// f64 -> sint
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;

// Extensions of an i1 are selects between two immediates, keyed on the
// predicate.

// sext i1: true becomes all-ones (-1).
def : Pat<(i16 (sext Int1Regs:$a)),
          (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)),
          (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)),
          (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: true becomes 1.
def : Pat<(i16 (zext Int1Regs:$a)),
          (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)),
          (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)),
          (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: the upper bits are unspecified; -1/0 is used here.
def : Pat<(i16 (anyext Int1Regs:$a)),
          (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)),
          (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)),
          (SELP_u64ii -1, 0, Int1Regs:$a)>;

// sext i16
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16, then anyext i16. Both are selected to the same unsigned cvt.
foreach widen = [zext, anyext] in {
  def : Pat<(i32 (widen Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
  def : Pat<(i64 (widen Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
}

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32, then anyext i32. Both are selected to the same unsigned cvt.
foreach widen = [zext, anyext] in
  def : Pat<(i64 (widen Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;


// Truncation to i16/i32 is an unsigned cvt. Truncation to i1 keeps only the
// lowest bit: the source is masked with 1 and the result compared against 1.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;

// sext_inreg: sign-extend the low 8/16/32 bits of a register in place.
def : Pat<(sext_inreg Int16Regs:$a, i8),
          (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8),
          (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16),
          (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8),
          (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16),
          (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32),
          (CVT_INREG_s64_s32 Int64Regs:$a)>;


// Select instructions with 32-bit predicates.
// The i32 condition is reduced to a real predicate by testing its lowest bit
// (and with 1, compare equal to 1); that predicate then drives the selp.
def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
          (SELP_b16rr
            Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
          (SELP_b32rr
            Int32Regs:$a, Int32Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr
            Int64Regs:$a, Int64Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
          (SELP_f16rr
            Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
          (SELP_bf16rr
            Int16Regs:$a, Int16Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr
            Float32Regs:$a, Float32Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr
            Float64Regs:$a, Float64Regs:$b,
            (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;


// Vector-style mov instructions. None of them carries a selection pattern;
// they are only referenced by name from other patterns or from C++ code.
let hasSideEffects = false in {
  // Pack a set of smaller registers into one larger register.
  def V4I16toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 :
    NVPTXInst<(outs Int32Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2),
              "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int32Regs:$s1, Int32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 :
    NVPTXInst<(outs Float64Regs:$d),
              (ins Float32Regs:$s1, Float32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // Unpack one larger register into a set of smaller registers.
  def I64toV4I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
              (ins Int32Regs:$s),
              "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 :
    NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 :
    NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
              (ins Float64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;

  // Extract only one half of a register; the other half is moved into a
  // scratch register declared inside the emitted PTX block.
  def I32toI16H :
    NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", []>;
  def I32toI16L :
    NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", []>;
  def I64toI32H :
    NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s),
              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", []>;
}

// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
// Shift right by half the width followed by a truncate: take the high half
// directly. Logical and arithmetic shifts give the same high half.
foreach shr = [srl, sra] in
  def : Pat<(i16 (trunc (shr Int32Regs:$s, (i32 16)))),
            (I32toI16H Int32Regs:$s)>;
foreach shr = [srl, sra] in
  def : Pat<(i32 (trunc (shr Int64Regs:$s, (i32 32)))),
            (I64toI32H Int64Regs:$s)>;

// Sign-extending element 0 of a v2i16 is an in-register sign extension of the
// low 16 bits of the 32-bit register that holds the vector.
def : Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
          (CVT_INREG_s32_s16 Int32Regs:$src)>;

// Two-element 16-bit vectors live in a 32-bit register: element 0 is the low
// half, element 1 the high half.
foreach vecTy = [v2f16, v2bf16, v2i16] in {
  def : Pat<(extractelt (vecTy Int32Regs:$src), 0),
            (I32toI16L Int32Regs:$src)>;
  def : Pat<(extractelt (vecTy Int32Regs:$src), 1),
            (I32toI16H Int32Regs:$src)>;
}

// Building such a vector packs the two 16-bit registers into one 32-bit
// register.
def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;

def : Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;

// Count leading zeros. Both variants write a 32-bit result register.
let hasSideEffects = false in
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                       "clz.b32 \t$d, $a;", []>;
let hasSideEffects = false in
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                       "clz.b64 \t$d, $a;", []>;

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctlz (i32 Int32Regs:$a))),
          (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)),
          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))),
          (CLZr64 Int64Regs:$a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary. We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then count the leading zeros of that value. This way we don't have to
// subtract 16 from the result. Unfortunately today we don't have a way to
// generate "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
            (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE),
            16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri
            (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
            16)>;

// Population count. Both variants write a 32-bit result register.
let hasSideEffects = false in
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                        "popc.b32 \t$d, $a;", []>;
let hasSideEffects = false in
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                        "popc.b64 \t$d, $a;", []>;

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctpop (i32 Int32Regs:$a))),
          (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32
            (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
            CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;

// Floating-point narrowing. All of these round to nearest (CvtRN); only the
// f64 -> f32 case has an FTZ variant, which is listed first and guarded by
// doF32FTZ.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)), (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f32 -> bf16
def : Pat<(bf16 (fpround Float32Regs:$a)), (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)), (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> bf16
def : Pat<(bf16 (fpround Float64Regs:$a)), (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// Floating-point widening. No rounding mode is needed (CvtNONE). The
// conversions that produce or consume an f32 have an FTZ variant, listed
// first and guarded by doF32FTZ.

// fpextend f16 -> f32
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f32
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f64
def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>,
      Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;

def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
                     [SDNPHasChain, SDNPOptInGlue]>;

// fceil, ffloor, froundeven, ftrunc.

// Selects a same-type cvt with an integer rounding mode for one rounding
// node, for each of f16, bf16, f32 and f64.
//   OpNode       - the rounding SDNode being selected.
//   RoundMode    - conversion mode used for f16, bf16, f64 and non-FTZ f32.
//   RoundModeFTZ - conversion mode used for f32 when doF32FTZ holds.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf RoundMode, PatLeaf RoundModeFTZ> {
  // 16-bit types.
  def : Pat<(OpNode (f16 Int16Regs:$a)),
            (CVT_f16_f16 Int16Regs:$a, RoundMode)>;
  def : Pat<(OpNode (bf16 Int16Regs:$a)),
            (CVT_bf16_bf16 Int16Regs:$a, RoundMode)>;

  // f32: the FTZ and non-FTZ forms are mutually exclusive.
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, RoundModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, RoundMode)>,
        Requires<[doNoF32FTZ]>;

  // f64.
  def : Pat<(OpNode Float64Regs:$a),
            (CVT_f64_f64 Float64Regs:$a, RoundMode)>;
}

defm : CVT_ROUND<fceil,      CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor,     CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc,     CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even. This isn't
// strictly correct, because it causes us to ignore the rounding mode. But it
// matches what CUDA's "libm" does.

foreach roundNode = [fnearbyint, frint] in
  defm : CVT_ROUND<roundNode, CvtRNI, CvtRNI_FTZ>;

//-----------------------------------
// Control-flow
//-----------------------------------

let isTerminator = 1 in {
  // Function return.
  let isReturn = 1, isBarrier = 1 in
  def Return :
    NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  let isBranch = 1 in {
    // Branch taken when the predicate is set.
    def CBranch :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@$a bra \t$target;",
                [(brcond Int1Regs:$a, bb:$target)]>;
    // Branch taken when the predicate is clear; it has no pattern of its own.
    def CBranchOther :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@!$a bra \t$target;", []>;
  }

  // Unconditional branch.
  let isBranch = 1, isBarrier = 1 in
  def GOTO :
    NVPTXInst<(outs), (ins brtarget:$target),
              "bra.uni \t$target;", [(br bb:$target)]>;
}

// A branch on an i32 condition first turns it into a predicate (value != 0).
def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block. The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1). Since ptx
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;

// Call
// Type profiles and nodes for the call-sequence markers; both markers take
// two i32 operands.
def SDT_NVPTXCallSeqStart :
  SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd :
  SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start :
  SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end :
  SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;

// The call node itself: no results, one i32 operand (the callee).
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;

let isCall = 1 in
def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

// Calls to a global address and to an external symbol both select CALL.
def : Pat<(call tglobaladdr:$dst),
          (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst),
          (CALL texternalsym:$dst)>;

// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;

// Call-sequence start: opens a PTX scope and declares temp_param_reg in it.
def Callseq_Start :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;

// Call-sequence end: closes the scope opened by Callseq_Start.
def Callseq_End :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;

// trap instruction
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
def trapinst :
  NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;

// Call prototype wrapper
// The node takes a single integer operand and produces no result.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Operand for the prototype identifier; printed by printProtoIdent.
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}

// The emitted text is just the identifier operand.
def CALL_PROTOTYPE :
  NVPTXInst<(outs),
            (ins ProtoIdent:$ident),
            "$ident",
            [(CallPrototype (i32 texternalsym:$ident))]>;


include "NVPTXIntrinsics.td"


//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.