//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out.  Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {

}
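// GCNPredicateControl supplies the predicate fields (e.g. OtherPredicates,
// SubtargetPredicate) that the `let` blocks throughout this file set on such
// patterns. A minimal usage sketch (hypothetical node and instruction names,
// for illustration only):
//   let OtherPredicates = [SomeFeature] in
//   def : GCNPat <(i32 (some_node i32:$src)), (SOME_INST $src)>;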

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS]

let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
  let maybeAtomic = 1;
}
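// The $ordering and $scope immediates carry the AtomicOrdering and
// SyncScope::ID values of the IR fence; the memory legalizer later expands
// this pseudo into the required wait and cache-invalidate instructions.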

def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
  let HasExt = 1;
  let HasExtDPP = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)>;

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by the WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in WWM) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

// Invert the exec mask and overwrite the inactive lanes of dst with the
// $inactive operand, restoring the exec mask once done.
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src, VSrc_b32:$inactive),
  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
  let Constraints = "$src = $vdst";
}

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VReg_64:$src, VSrc_b64:$inactive),
  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
  let Constraints = "$src = $vdst";
}

let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC, EXEC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;

def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;

def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}
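// For example, S_MOV_B64_term below behaves exactly like S_MOV_B64 but is a
// terminator, so exec-mask manipulation emitted at the end of a block cannot
// be separated from the branch that depends on it.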

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}

def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

// Dummy terminator instruction to use after control flow instructions are
// replaced with exec mask operations.
def SI_MASK_BRANCH : VPseudoInstSI <
  (outs), (ins brtarget:$target)> {
  let isBranch = 0;
  let isTerminator = 1;
  let isBarrier = 0;
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let Size = 0;
}

let isTerminator = 1 in {

let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
  let Size = 12;
}
}

def SI_IF : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
}

} // End isTerminator = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

// Branch to the early termination block of the shader if SCC is 0.
// This uses SCC from a previous SALU operation, i.e. the update of
// a mask of live lanes after a kill/demote operation.
// Only valid in pixel shaders.
def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC,SCC];
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC,VCC,SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC,VCC,SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}
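// Each defm below therefore produces two variants, e.g. SI_KILL_I1_PSEUDO
// (expanded through the custom inserter) and SI_KILL_I1_TERMINATOR (kept as
// a block terminator).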

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC] in
def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;

let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Call pseudo without an output register.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;

// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SPseudoInstSI <(outs),
  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
  UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1

// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                  RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;

def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;

// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
// pseudos we avoid spills or copies being inserted within indirect sequences
// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
// this mode switching.

class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;

class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;

multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1
}

// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96  : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

// VGPR or AGPR spill instructions. Spilling an AGPR additionally requires a
// temporary register and an extra instruction to move between VGPR and AGPR;
// UsesTmp adds this overhead to the total size of an expanded spill.
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
  let UseNamedOperandTable = 1, VGPRSpill = 1,
       SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
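      // Worked example (illustrative): VReg_128 has Size = 128 bits, so the
      // srl by 5 yields 4 dword subregs; with UsesTmp = 0 the shl by 3 gives
      // 8 bytes per subreg, and the final +8 is the (2 * 4) byte overhead
      // above: 4 * 8 + 8 = 40 bytes.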
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}

defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

defm SI_SPILL_A32  : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64  : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96  : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;

def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target)
>;

def : Pat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

// TODO: we could add more variants for other types of conditionals

def : Pat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

let OtherPredicates = [UnsafeFPMath] in {

//defm : RsqPat<V_RSQ_F32_e32, f32>;

def : RsqPat<V_RSQ_F32_e32, f32>;

// Convert (x - floor(x)) to fract(x)
def : GCNPat <
  (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
             (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
  (V_FRACT_F32_e64 $mods, $x)
>;

// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
  (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
             (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
  (V_FRACT_F64_e64 $mods, $x)
>;

} // End OtherPredicates = [UnsafeFPMath]

// f16_to_fp patterns
def : GCNPat <
  (f32 (f16_to_fp i32:$src0)),
  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
  (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;

def : GCNPat <
  (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
>;

def : GCNPat <
  (f64 (fpextend f16:$src)),
  (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;

// fp_to_fp16 patterns
def : GCNPat <
  (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
>;

def : GCNPat <
  (i32 (fp_to_sint f16:$src)),
  (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (i32 (fp_to_uint f16:$src)),
  (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (f16 (sint_to_fp i32:$src)),
  (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (f16 (uint_to_fp i32:$src)),
  (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// NoMods pattern used for mac. If there are any source modifiers then it's
// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
  : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
                      (vt (VOP3NoMods vt:$src1)),
                      (vt (VOP3NoMods vt:$src2)))),
    (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
          SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
let OtherPredicates = [HasMadMacF32Insts] in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
} // OtherPredicates = [HasMadMacF32Insts]

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select mad instead of mac.
let SubtargetPredicate = isGFX6GFX7GFX10,
    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
                                    (VOP3NoMods f32:$src1)),
                 (VOP3NoMods f32:$src2))),
      (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                            SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let SubtargetPredicate = HasFmaLegacy32 in
def : GCNPat <
      (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
                                  (VOP3NoMods f32:$src1),
                                  (VOP3NoMods f32:$src2))),
      (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
} // SubtargetPredicate = Has16BitInsts
} // AddedComplexity = 9

class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
  : GCNPat<
  (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
               (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
               (Ty (VOP3Mods Ty:$src2, i32:$src2_mod)))),
  (inst $src0_mod, $src0, $src1_mod, $src1,
  $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let OtherPredicates = [HasMadMacF32Insts] in
def : FMADModsPat<f32, V_MAD_F32_e64, AMDGPUfmad_ftz>;

let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
                                    (VOP3Mods f32:$src1, i32:$src1_mod)),
                 (VOP3Mods f32:$src2, i32:$src2_mod))),
      (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
                        $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in
def : FMADModsPat<f16, V_MAD_F16_e64, AMDGPUfmad_ftz>;

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
                        (VOP3Mods vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;
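// Note the operand order: v_cndmask_b32 selects $src1 when the condition is
// set, so the select's false value ($src2) goes in the first source slot and
// the true value ($src1) in the second.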

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
def : VOPSelectPat <f16>;
def : VOPSelectPat <i16>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (ctpop i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
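// All three patterns rely on v_bcnt_u32_b32 computing CountOneBits(S0) + S1,
// which is why the plain ctpop form supplies 0 as the second source.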

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

foreach Index = 0-1 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2i32_#Index : Insert_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2f32_#Index : Insert_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-2 in {
  def Extract_Element_v3i32_#Index : Extract_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3i32_#Index : Insert_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v3f32_#Index : Extract_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3f32_#Index : Insert_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-4 in {
  def Extract_Element_v5i32_#Index : Extract_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5i32_#Index : Insert_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v5f32_#Index : Extract_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5f32_#Index : Insert_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

def : Pat <
  (extract_subvector v4i16:$vec, (i32 0)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4i16:$vec, (i32 2)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 0)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 2)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;

foreach Index = 0-31 in {
  def Extract_Element_v32i32_#Index : Extract_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32i32_#Index : Insert_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v32f32_#Index : Extract_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32f32_#Index : Insert_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

// FIXME: Why are only some of these type combinations defined for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <v2i16, i32, SReg_32>;
def : BitConvert <i32, v2i16, SReg_32>;
def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;

// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4i16, VReg_64>;

// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2i32, VReg_64>;
def : BitConvert <v2i32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2i32, VReg_64>;
def : BitConvert <v2f32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2f32, VReg_64>;
def : BitConvert <v2f32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2f32, VReg_64>;
def : BitConvert <v4i16, f64, VReg_64>;
def : BitConvert <v4f16, f64, VReg_64>;
def : BitConvert <f64, v4i16, VReg_64>;
def : BitConvert <f64, v4f16, VReg_64>;
def : BitConvert <v4i16, i64, VReg_64>;
def : BitConvert <v4f16, i64, VReg_64>;
def : BitConvert <i64, v4i16, VReg_64>;
def : BitConvert <i64, v4f16, VReg_64>;

def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;

// 96-bit bitcast
def : BitConvert <v3i32, v3f32, SGPR_96>;
def : BitConvert <v3f32, v3i32, SGPR_96>;

// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;

// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
def : BitConvert <v5f32, v5i32, SGPR_160>;

// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v4i64, v4f64, VReg_256>;
def : BitConvert <v4f64, v4i64, VReg_256>;
def : BitConvert <v4i64, v8i32, VReg_256>;
def : BitConvert <v4i64, v8f32, VReg_256>;
def : BitConvert <v4f64, v8i32, VReg_256>;
def : BitConvert <v4f64, v8f32, VReg_256>;
def : BitConvert <v8i32, v4i64, VReg_256>;
def : BitConvert <v8f32, v4i64, VReg_256>;
def : BitConvert <v8i32, v4f64, VReg_256>;
def : BitConvert <v8f32, v4f64, VReg_256>;

// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64,  v8f64,  VReg_512>;
def : BitConvert <v8f64,  v8i64,  VReg_512>;
def : BitConvert <v8i64,  v16i32, VReg_512>;
def : BitConvert <v8f64,  v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64,  VReg_512>;
def : BitConvert <v16i32, v8f64,  VReg_512>;
def : BitConvert <v8i64,  v16f32, VReg_512>;
def : BitConvert <v8f64,  v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64,  VReg_512>;
def : BitConvert <v16f32, v8f64,  VReg_512>;

// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
def : BitConvert <v16i64, v16f64, VReg_1024>;
def : BitConvert <v16f64, v16i64, VReg_1024>;
def : BitConvert <v16i64, v32i32, VReg_1024>;
def : BitConvert <v32i32, v16i64, VReg_1024>;
def : BitConvert <v16f64, v32f32, VReg_1024>;
def : BitConvert <v32f32, v16f64, VReg_1024>;
def : BitConvert <v16i64, v32f32, VReg_1024>;
def : BitConvert <v32i32, v16f64, VReg_1024>;
def : BitConvert <v16f64, v32i32, VReg_1024>;
def : BitConvert <v32f32, v16i64, VReg_1024>;

/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/

// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
class ClampPat<Instruction inst, ValueType vt> : GCNPat <
  (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
  (inst i32:$src0_modifiers, vt:$src0,
        i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
>;
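// For example, an f32 AMDGPUclamp selects to V_MAX_F32_e64 with the same
// source (and modifiers) in both operands and the clamp bit set, i.e.
// max(x, x) clamped to [0.0, 1.0].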

def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64_e64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
  (V_PK_MAX_F16 $src0_modifiers, $src0,
                $src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
}

/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/

// Prevent expanding both fneg and fabs.
// TODO: Add IgnoredBySelectionDAG bit?
let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG

def : GCNPat <
  (fneg (fabs (f32 SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;

def : GCNPat <
  (fabs (f32 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
>;

def : GCNPat <
  (fneg (f32 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
>;

def : GCNPat <
  (fneg (f16 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
>;

def : GCNPat <
  (fneg (f16 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f16 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
>;

def : GCNPat <
  (fneg (fabs (f16 SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;

def : GCNPat <
  (fneg (fabs (f16 VGPR_32:$src))),
  (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fneg (v2f16 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
>;

def : GCNPat <
  (fabs (v2f16 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
>;

// This is really (fneg (fabs v2f16:$src))
//
// fabs is not reported as free because there is a modifier for it in
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
  (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

def : GCNPat <
  (fneg (v2f16 (fabs SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
// def : GCNPat <
//   (fneg (f64 SReg_64:$src)),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//                (i32 (S_MOV_B32 (i32 0x80000000)))),
//     sub1)
// >;

// def : GCNPat <
//   (fneg (fabs (f64 SReg_64:$src))),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//               (S_MOV_B32 (i32 0x80000000))), // Set sign bit.
//     sub1)
// >;

// FIXME: Use S_BITSET0_B32/B64?
// def : GCNPat <
//   (fabs (f64 SReg_64:$src)),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//                (i32 (S_MOV_B32 (i32 0x7fffffff)))),
//     sub1)
// >;

} // End let AddedComplexity = 1

def : GCNPat <
  (fabs (f32 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f32 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f16 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (v2f16 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 (fabs VGPR_32:$src))),
  (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fabs (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                   (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
    sub1)
>;

// TODO: Use SGPR for constant
def : GCNPat <
  (fneg (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                   (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
    sub1)
>;

// TODO: Use SGPR for constant
def : GCNPat <
  (fneg (fabs (f64 VReg_64:$src))),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                  (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
    sub1)
>;

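// The fcopysign patterns below use v_bfi_b32, which computes
// (S0 & S1) | (~S0 & S2); with the mask 0x7fffffff (0x00007fff for f16) in
// S0, the magnitude bits come from $src0 and the sign bit from $src1.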
def : GCNPat <
  (fcopysign f16:$src0, f16:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
  (fcopysign f32:$src0, f16:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
             (V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f64:$src0, f16:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
               (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
  (fcopysign f16:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
             (V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f16:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
             (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;

/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/

def : GCNPat <
  (VGPRImm<(i32 imm)>:$imm),
  (V_MOV_B32_e32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f32 fpimm)>:$imm),
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (i32 imm:$imm),
  (S_MOV_B32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(SIlds tglobaladdr:$ga)>),
  (V_MOV_B32_e32 $ga)
>;

def : GCNPat <
  (SIlds tglobaladdr:$ga),
  (S_MOV_B32 $ga)
>;

// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding.  Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : GCNPat <
  (VGPRImm<(f16 fpimm)>:$imm),
  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (i64 InlineImm64:$imm),
  (S_MOV_B64 InlineImm64:$imm)
>;

// XXX - Should this use an s_cmp to set SCC?

// Set to sign-extended 64-bit value (true = -1, false = 0)
def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B64 (i64 (as_i64imm $imm)))
> {
  let WaveSizePredicate = isWave64;
}

def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B32 (i32 (as_i32imm $imm)))
> {
  let WaveSizePredicate = isWave32;
}

def : GCNPat <
  (f64 InlineImmFP64:$imm),
  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
>;

/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/

// FIXME: Should use _e64 and select source modifiers.
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;

def : GCNPat <
  (i32 (sext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;

class Ext32Pat <SDNode ext> : GCNPat <
  (i32 (ext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;

def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;

// The multiplication scales from [0,1) to the unsigned integer range,
// rounding down a bit to avoid unwanted overflow.
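// CONST.FP_4294966784 is 4294966784.0 (0x4f7ffffe), i.e. 2^32 - 512, so the
// scale factor stays just below 2^32.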
def : GCNPat <
  (AMDGPUurecip i32:$src0),
  (V_CVT_U32_F32_e32
    (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
                   (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;

//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//

def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
def : UMad24Pat<V_MAD_U32_U24_e64, 1>;

// BFI patterns

def BFIImm32 : PatFrag<
  (ops node:$x, node:$y, node:$z),
  (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
  [{
    auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
    auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
    return X && NotX &&
      ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
  }]
>;
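// For example, (y & 0x00ff00ff) | (z & 0xff00ff00) passes the complement
// check above and selects to a single v_bfi_b32.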
1581
1582// Definition from ISA doc:
1583// (y & x) | (z & ~x)
1584def : AMDGPUPat <
1585  (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
1586  (V_BFI_B32_e64 $x, $y, $z)
1587>;
1588
1589// (y & C) | (z & ~C)
1590def : AMDGPUPat <
1591  (BFIImm32 i32:$x, i32:$y, i32:$z),
1592  (V_BFI_B32_e64 $x, $y, $z)
1593>;
1594
1595// 64-bit version
1596def : AMDGPUPat <
1597  (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
1598  (REG_SEQUENCE SReg_64,
1599    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
1600               (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
1601               (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
1602    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
1603               (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
1604               (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
1605>;
1606
1607// SHA-256 Ch function
1608// z ^ (x & (y ^ z))
1609def : AMDGPUPat <
1610  (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
1611  (V_BFI_B32_e64 $x, $y, $z)
1612>;
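// Why v_bfi fits here: it computes (s0 & s1) | (~s0 & s2), and the Ch
// identity z ^ (x & (y ^ z)) expands bitwise to (x & y) | (~x & z): where a
// bit of x is 1 the two z terms cancel and y remains, and where it is 0 the
// x & ... term vanishes and z passes through.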

// 64-bit version
def : AMDGPUPat <
  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
  (REG_SEQUENCE SReg_64,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
               (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
               (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
               (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
               (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
>;

def : AMDGPUPat <
  (fcopysign f32:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
>;

def : AMDGPUPat <
  (fcopysign f32:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
             (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
>;

def : AMDGPUPat <
  (fcopysign f64:$src0, f64:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
               (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
               (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
>;

def : AMDGPUPat <
  (fcopysign f64:$src0, f32:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
               (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
               $src1), sub1)
>;

def : ROTRPattern <V_ALIGNBIT_B32_e64>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
          (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
          (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;

/********** ====================== **********/
/**********   Indirect addressing  **********/
/********** ====================== **********/

multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
  // Extract with offset
  def : GCNPat<
    (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
  >;

  // Insert with offset
  def : GCNPat<
    (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
  >;
}

defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;

defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;

//===----------------------------------------------------------------------===//
// SAD Patterns
//===----------------------------------------------------------------------===//

def : GCNPat <
  (add (sub_oneuse (umax i32:$src0, i32:$src1),
                   (umin i32:$src0, i32:$src1)),
       i32:$src2),
  (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;
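// Both SAD patterns compute |src0 - src1| + src2: for unsigned values,
// umax(a, b) - umin(a, b) == |a - b|, and the compare-and-select form below
// spells out the same absolute difference. v_sad_u32 folds either shape into
// a single instruction.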

def : GCNPat <
  (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
                      (sub i32:$src0, i32:$src1),
                      (sub i32:$src1, i32:$src0)),
       i32:$src2),
  (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;

//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//

def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
  (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16

// Handle sext_inreg in i64
def : GCNPat <
  (i64 (sext_inreg i64:$src, i1)),
  (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;

def : GCNPat <
  (i16 (sext_inreg i16:$src, i1)),
  (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;

def : GCNPat <
  (i16 (sext_inreg i16:$src, i8)),
  (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i8)),
  (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i16)),
  (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i32)),
  (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
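// As the inline comments note, the s_bfe source-field operand packs the
// bitfield as (offset | width << 16). Every sext_inreg above extracts
// starting at bit 0, so only the width field varies, e.g. width 32 gives
// 32 << 16 == 0x200000.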

def : GCNPat <
  (i64 (zext i32:$src)),
  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;

def : GCNPat <
  (i64 (anyext i32:$src)),
  (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;

class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
  (i64 (ext i1:$src)),
    (REG_SEQUENCE VReg_64,
      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
      sub0, (S_MOV_B32 (i32 0)), sub1)
>;


def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;

// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : GCNPat <
  (i64 (sext i32:$src)),
    (REG_SEQUENCE SReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;

def : GCNPat <
  (i64 (sext i1:$src)),
  (REG_SEQUENCE VReg_64,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;

class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
  (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;

def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
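// These work because the conversions are only defined for in-range inputs:
// the sole value that fp_to_uint maps to i1 1 is 1.0 (and -1.0 for
// fp_to_sint, whose i1 range is [-1, 0]), so a compare-equal against that
// constant yields the result bit directly.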

// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisons may write to a pair of SGPRs or a single SGPR, so treat
// these as 32 or 64-bit comparisons. When legalizing SGPR copies, the
// instructions that produce the copies from SCC into these operations
// will be moved to the VALU.

let WaveSizePredicate = isWave64 in {
def : GCNPat <
  (i1 (and i1:$src0, i1:$src1)),
  (S_AND_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (or i1:$src0, i1:$src1)),
  (S_OR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (xor i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (add i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (sub i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i1 (add i1:$src0, (i1 -1))),
  (S_NOT_B64 $src0)
>;

def : GCNPat <
  (i1 (sub i1:$src0, (i1 -1))),
  (S_NOT_B64 $src0)
>;
}
} // end isWave64

let WaveSizePredicate = isWave32 in {
def : GCNPat <
  (i1 (and i1:$src0, i1:$src1)),
  (S_AND_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (or i1:$src0, i1:$src1)),
  (S_OR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (xor i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (add i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (sub i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i1 (add i1:$src0, (i1 -1))),
  (S_NOT_B32 $src0)
>;

def : GCNPat <
  (i1 (sub i1:$src0, (i1 -1))),
  (S_NOT_B32 $src0)
>;
}
} // end isWave32

def : GCNPat <
  (f16 (sint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f16 (uint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f32 (sint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                        SSrc_i1:$src)
>;

def : GCNPat <
  (f32 (uint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                        SSrc_i1:$src)
>;

def : GCNPat <
  (f64 (sint_to_fp i1:$src)),
  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 -1),
                                        SSrc_i1:$src))
>;

def : GCNPat <
  (f64 (uint_to_fp i1:$src)),
  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 1),
                                        SSrc_i1:$src))
>;

//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
def : GCNPat <
  (i32 (AMDGPUfp16_zext f16:$src)),
  (COPY $src)
>;


def : GCNPat <
  (i32 (trunc i64:$a)),
  (EXTRACT_SUBREG $a, sub0)
>;

def : GCNPat <
  (i1 (trunc i32:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (trunc i16:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (trunc i64:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
                    (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
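// trunc to i1 only keeps bit 0, so the three patterns above mask with 1 and
// compare the result against 1 to produce the i1 in lane-mask form.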

def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
             (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
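// How this works: v_alignbit with both sources equal is a rotate right, so
// for a = b3b2b1b0 (bytes, b3 most significant) the rotates by 24 and 8 give
// b2b1b0b3 and b0b3b2b1. v_bfi with mask 0x00ff00ff then takes bytes 0 and 2
// from the first and bytes 1 and 3 from the second, yielding b0b1b2b3, i.e.
// bswap(a).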

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 24)),
             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 8))),
  sub0,
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 24)),
             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 8))),
  sub1)
>;

// FIXME: The AddedComplexity should not be needed, but in GlobalISel
// the BFI pattern ends up taking precedence without it.
let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
//
// My reading of the manual suggests we should be using src0 for the
// register value, but this is what seems to work.
def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_PERM_B32_e64  (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
              (S_MOV_B32 (i32 0x00010203))),
  sub0,
  (V_PERM_B32_e64  (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
              (S_MOV_B32 (i32 0x00010203))),
  sub1)
>;

// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
// The 12s emit 0s.
def : GCNPat <
  (i16 (bswap i16:$a)),
  (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

def : GCNPat <
  (i32 (zext (bswap i16:$a))),
  (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
  (v2i16 (bswap v2i16:$a)),
  (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;

}
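
// Worked example for the v_perm selectors above: each selector byte k picks
// result byte k from the 8-byte concatenation {src0, src1}, where selector
// values 0-3 index src1's bytes, 4-7 index src0's, and 12 emits 0x00. With
// src0 = 0 and the i32 selector 0x00010203, result byte 0 takes src1 byte 3,
// byte 1 takes byte 2, and so on: a full byte reverse of $a.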


// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
def : GCNPat<
  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;

def : GCNPat<
  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;

// TODO: Handle fneg like other types.
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64_e64  0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64_e64  $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f64_preds;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    // FIXME: Should have 16-bit inst subtarget predicate
    let OtherPredicates = f16_preds;
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
def : GCNPat <
  (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
       (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
       (f32 (VOP3NoMods f32:$src2))),
  (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
                  SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]

let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
  (fma (f16 (VOP3Mods f16:$src0, i32:$src0_modifiers)),
       (f16 (VOP3Mods f16:$src1, i32:$src1_modifiers)),
       (f16 (VOP3NoMods f16:$src2))),
  (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
                  SRCMODS.NONE, $src2)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i16 16))
>;

def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;

def : GCNPat <
  (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
>;

def : GCNPat <
  (v2f16 (build_vector f16:$src0, (f16 undef))),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

def : GCNPat <
  (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;


def : GCNPat <
  (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
                       (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

// TODO: Should source modifiers be matched to v_pack_b32_f16?
def : GCNPat <
  (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

} // End SubtargetPredicate = HasVOP3PInsts


def : GCNPat <
  (v2f16 (scalar_to_vector f16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (scalar_to_vector i16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v4i16 (scalar_to_vector i16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (v4f16 (scalar_to_vector f16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                           timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
                        (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
                        (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

def : GCNPat <
  (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                              timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
                        (as_i32timm $row_mask), (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
//    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
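// (0.99999999999999999 is encoded below as 0x3fefffffffffffff, the largest
// f64 strictly less than 1.0, i.e. 1.0 - 2^-53.)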

// Convert floor(x) to (x - fract(x))

// Don't bother handling this for GlobalISel, it's handled during
// lowering.
//
// FIXME: DAG should also custom lower this.
def : GCNPat <
  (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
  (V_ADD_F64_e64
      $mods,
      $x,
      SRCMODS.NEG,
      (V_CNDMASK_B64_PSEUDO
         (V_MIN_F64_e64
             SRCMODS.NONE,
             (V_FRACT_F64_e64 $mods, $x),
             SRCMODS.NONE,
             (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
         $x,
         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;

} // End SubtargetPredicate = isGFX6

//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//

// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
>;

def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = HasAddNoCarryInsts;
}

def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = NotHasAddNoCarryInsts;
}


// Avoid pointlessly materializing a constant in VGPR.
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
  (int_amdgcn_readfirstlane (i32 imm:$src)),
  (S_MOV_B32 SReg_32:$src)
>;

multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
  def : GCNPat <
    (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
    (BFM $a, $b)
  >;

  def : GCNPat <
    (vt (add (vt (shl 1, vt:$a)), -1)),
    (BFM $a, (MOV (i32 0)))
  >;
}

defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
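
// s_bfm_b32 builds the mask ((1 << src0) - 1) << src1, which is exactly the
// shifted-mask shape the first pattern matches (src0 = width, src1 = offset);
// the second pattern is the unshifted mask, so a zero offset is materialized
// with the MOV.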

// Bitfield extract patterns

def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
  return isMask_32(Imm);
}]>;

def IMMPopCount : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
                                   MVT::i32);
}]>;

def : AMDGPUPat <
  (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
                         IMMZeroBasedBitfieldMask:$mask),
  (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
>;

// x & ((1 << y) - 1)
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x & ~(-1 << y)
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src,
                         (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x & (-1 >> (bitwidth - y))
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x << (bitwidth - y) >> (bitwidth - y)
def : AMDGPUPat <
  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
                         (sub 32, i32:$width)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

def : AMDGPUPat <
  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
                         (sub 32, i32:$width)),
  (V_BFE_I32_e64 $src, (i32 0), $width)
>;

// SHA-256 Ma patterns

// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
def : AMDGPUPat <
  (DivergentBinFrag<or> (and i32:$x, i32:$z),
                        (and i32:$y, (or i32:$x, i32:$z))),
  (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
>;
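// This is the bitwise majority function: where x == y the xor selector is 0
// and v_bfi passes y (== x) through; where x != y it selects z, breaking the
// tie. That matches ((x & z) | (y & (x | z))) bit for bit.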

def : AMDGPUPat <
  (DivergentBinFrag<or> (and i64:$x, i64:$z),
                        (and i64:$y, (or i64:$x, i64:$z))),
  (REG_SEQUENCE SReg_64,
    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
                    (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
               (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
               (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
                    (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
               (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
               (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
>;

multiclass IntMed3Pat<Instruction med3Inst,
                 SDPatternOperator min,
                 SDPatternOperator max,
                 SDPatternOperator min_oneuse,
                 SDPatternOperator max_oneuse> {

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
  def : AMDGPUPat <
  (min (max_oneuse i32:$src0, i32:$src1),
       (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)),
  (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;

  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : AMDGPUPat <
  (max (min_oneuse i32:$src0, i32:$src1),
       (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
  (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
}
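
// The "16 permutations" counts follow from commutativity: each of the four
// min/max nodes in the matched expression can take its two operands in either
// order, giving 2^4 = 16 equivalent trees, all of which select to med3.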

defm : IntMed3Pat<V_MED3_I32_e64, smin, smax, smin_oneuse, smax_oneuse>;
defm : IntMed3Pat<V_MED3_U32_e64, umin, umax, umin_oneuse, umax_oneuse>;

// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
                //SDPatternOperator max, SDPatternOperator min,
                Instruction med3Inst> : GCNPat<
  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                           (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
  (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class FP16Med3Pat<ValueType vt,
                Instruction med3Inst> : GCNPat<
  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                           (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
  (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
>;

multiclass Int16Med3Pat<Instruction med3Inst,
                   SDPatternOperator min,
                   SDPatternOperator max,
                   SDPatternOperator max_oneuse,
                   SDPatternOperator min_oneuse> {
  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : GCNPat <
  (max (min_oneuse i16:$src0, i16:$src1),
       (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)),
  (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
>;

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
  def : GCNPat <
  (min (max_oneuse i16:$src0, i16:$src1),
      (max_oneuse (min_oneuse i16:$src0, i16:$src1), i16:$src2)),
  (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
>;
}

def : FPMed3Pat<f32, V_MED3_F32_e64>;

let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16_e64>;
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, umax_oneuse, umin_oneuse>;
} // End OtherPredicates = [isGFX9Plus]

class AMDGPUGenericInstruction : GenericInstruction {
  let Namespace = "AMDGPU";
}

def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;

class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

foreach N = 0-3 in {
def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0);
  let hasSideEffects = 0;
}
}

// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$oldval);
  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}

class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
  let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone, but
// really needs a memory operand.
def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}
