1;; Copyright (C) 2016-2021 Free Software Foundation, Inc.
2
3;; This file is free software; you can redistribute it and/or modify it under
4;; the terms of the GNU General Public License as published by the Free
5;; Software Foundation; either version 3 of the License, or (at your option)
6;; any later version.
7
8;; This file is distributed in the hope that it will be useful, but WITHOUT
9;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11;; for more details.
12
13;; You should have received a copy of the GNU General Public License
14;; along with GCC; see the file COPYING3.  If not see
15;; <http://www.gnu.org/licenses/>.
16
17;;- See file "rtl.def" for documentation on define_insn, match_*, et. al.
18
19(include "predicates.md")
20(include "constraints.md")
21
22;; {{{ Constants and enums
23
; Named hard registers.  The numbering follows the GCN SGPR/VGPR layout:
; general SGPRs first, then the special scalar registers (FLAT_SCRATCH,
; XNACK, VCC, TBA, TMA, TTMP, M0, EXEC), the status bits, then the VGPRs.
(define_constants
  [(FIRST_SGPR_REG		 0)
   ; s22: used to preserve SCC around the long-branch sequences emitted by
   ; "jump"/"cjump" and by "movdi_symbol_save_scc" below.
   (CC_SAVE_REG			 22)
   (LAST_SGPR_REG		 101)
   (FLAT_SCRATCH_REG		 102)
   (FLAT_SCRATCH_LO_REG		 102)
   (FLAT_SCRATCH_HI_REG		 103)
   (XNACK_MASK_REG		 104)
   (XNACK_MASK_LO_REG		 104)
   (XNACK_MASK_HI_REG		 105)
   (VCC_REG			 106)
   (VCC_LO_REG			 106)
   (VCC_HI_REG			 107)
   (VCCZ_REG			 108)
   (TBA_REG			 109)
   (TBA_LO_REG			 109)
   (TBA_HI_REG			 110)
   (TMA_REG			 111)
   (TMA_LO_REG			 111)
   (TMA_HI_REG			 112)
   (TTMP0_REG			 113)
   (TTMP11_REG			 124)
   (M0_REG			 125)
   (EXEC_REG			 126)
   (EXEC_LO_REG			 126)
   (EXEC_HI_REG			 127)
   (EXECZ_REG			 128)
   (SCC_REG			 129)
   (FIRST_VGPR_REG		 160)
   (LAST_VGPR_REG		 415)])

; ABI registers: stack pointer (s16), link register pair (s[18:19], see
; the call patterns below), and the argument/frame pointers.  AP/FP are
; numbered beyond LAST_VGPR_REG, so presumably they are fake registers
; eliminated before allocation -- confirm against the register macros in
; the target C code.
(define_constants
  [(SP_REGNUM 16)
   (LR_REGNUM 18)
   (AP_REGNUM 416)
   (FP_REGNUM 418)])
61
; Volatile unspecs: operations with side effects that must not be
; reordered or deleted.
(define_c_enum "unspecv" [
  UNSPECV_PROLOGUE_USE
  UNSPECV_KERNEL_RETURN
  UNSPECV_BARRIER
  UNSPECV_ATOMIC
  UNSPECV_ICACHE_INV])

; Plain unspecs.  The *_DPP_SHR values name reduction steps implemented
; with data-parallel (DPP) row-shift operations; most of the rest are
; presumably used by vector patterns outside this file -- their users are
; not visible here.
(define_c_enum "unspec" [
  UNSPEC_ADDPTR
  UNSPEC_VECTOR
  UNSPEC_BPERMUTE
  UNSPEC_SGPRBASE
  UNSPEC_MEMORY_BARRIER
  UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
  UNSPEC_PLUS_DPP_SHR
  UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
  UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
  UNSPEC_MOV_DPP_SHR
  UNSPEC_MOV_FROM_LANE63
  UNSPEC_GATHER
  UNSPEC_SCATTER
  UNSPEC_RCP])
85
86;; }}}
87;; {{{ Attributes
88
89; Instruction type (encoding) as described in the ISA specification.
90; The following table summarizes possible operands of individual instruction
91; types and corresponding constraints.
92;
93; sop2 - scalar, two inputs, one output
94;	 ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
;		      vccz,execz,scc,inline immediate,fp inline immediate
96;	 sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
97;
98;	 Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
99;
100; sopk - scalar, inline constant input, one output
101;	 simm16: 16bit inline constant
102;	 sdst: same as sop2/ssrc0
103;
104;	 Constraints "=SD", "J"
105;
106; sop1 - scalar, one input, one output
;	 ssrc0: same as sop2/ssrc0.  FIXME: manual omits VCCZ
108;	 sdst: same as sop2/sdst
109;
110;	 Constraints "=SD", "SSA"
111;
; sopc - scalar, two inputs, one comparison
;	 ssrc0: same as sop2/ssrc0.
114;
115;	 Constraints "SSI,SSA","SSA,SSI"
116;
117; sopp - scalar, one constant input, one special
118;	 simm16
119;
120; smem - scalar memory
121;	 sbase: aligned pair of sgprs.  Specify {size[15:0], base[47:0]} in
122;               dwords
123;	 sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
124;	 offset: sgpr or 20bit unsigned byte offset
125;
126; vop2 - vector, two inputs, one output
127;	 vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
128;		inline constant -16 to -64, fp inline immediate, vccz, execz,
129;		scc, lds, literal constant, vgpr0-255
130;	 vsrc1: vgpr0-255
131;	 vdst: vgpr0-255
132;	 Limitations: At most one SGPR, at most one constant
133;		      if constant is used, SGPR must be M0
134;		      Only SRC0 can be LDS_DIRECT
135;
136;	 constraints: "=v", "vBSv", "v"
137;
138; vop1 - vector, one input, one output
139;	 vsrc0: same as vop2/src0
140;	 vdst: vgpr0-255
141;
142;	 constraints: "=v", "vBSv"
143;
; vopc - vector, two inputs, one comparison output;
145;	 vsrc0: same as vop2/src0
146;	 vsrc1: vgpr0-255
147;	 vdst:
148;
149;	 constraints: "vASv", "v"
150;
151; vop3a - vector, three inputs, one output
152;	 vdst: vgpr0-255, for v_cmp sgpr or vcc
153;	 abs,clamp
154;	 vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
155;		inline constant -16 to -64, fp inline immediate, vccz, execz,
156;		scc, lds_direct
157;		FIXME: really missing 1/pi? really 104 SGPRs
158;
159; vop3b - vector, three inputs, one vector output, one scalar output
160;	 vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
161;	 vdst: vgpr0-255
162;	 sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
163;
164; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
165;	 src0: vgpr0-255
166;	 dst_sel: BYTE_0-3, WORD_0-1, DWORD
167;	 dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
168;	 clamp: true/false
169;	 src0_sel: BYTE_0-3, WORD_0-1, DWORD
170;	 flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
;		src1_abs
172;
173; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
174;	 src0: vgpr0-255
175;	 dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
176;		  wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
177;		  bcast15, bcast31
178;	 flags: src0_neg, src0_abs, src1_neg, src1_abs
179;	 bank_mask: 4-bit mask
180;	 row_mask: 4-bit mask
181;
182; ds - Local and global data share instructions.
183;	 offset0: 8-bit constant
184;	 offset1: 8-bit constant
185;	 flag: gds
186;	 addr: vgpr0-255
187;	 data0: vgpr0-255
188;	 data1: vgpr0-255
189;	 vdst: vgpr0-255
190;
191; mubuf - Untyped memory buffer operation. First word with LDS, second word
192;	  non-LDS.
193;	 offset: 12-bit constant
194;	 vaddr: vgpr0-255
195;	 vdata: vgpr0-255
196;	 srsrc: sgpr0-102
197;	 soffset: sgpr0-102
198;	 flags: offen, idxen, glc, lds, slc, tfe
199;
200; mtbuf - Typed memory buffer operation. Two words
201;	 offset: 12-bit constant
202;	 dfmt: 4-bit constant
203;	 nfmt: 3-bit constant
204;	 vaddr: vgpr0-255
205;	 vdata: vgpr0-255
206;	 srsrc: sgpr0-102
207;	 soffset: sgpr0-102
208;	 flags: offen, idxen, glc, lds, slc, tfe
209;
210; flat - flat or global memory operations
211;	 flags: glc, slc
212;	 addr: vgpr0-255
213;	 data: vgpr0-255
214;	 vdst: vgpr0-255
215;
216; mult - expands to multiple instructions (pseudo encoding)
217;
218; vmult - as mult, when a vector instruction is used.
219
; Instruction encoding type, as described in the ISA specification and in
; the summary comment above.  Used below to derive the execution unit and
; the worst-case instruction length.
(define_attr "type"
	     "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
	      vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
	     (const_string "unknown"))

; Set if instruction is executed in scalar or vector unit.
; Derived entirely from the "type" attribute; "mult" is scalar and
; "vmult" is vector by convention (see the type comment above).

(define_attr "unit" "unknown,scalar,vector"
  (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
	    (const_string "scalar")
	 (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
			  vop_sdwa,vop_dpp,flat,vmult")
	    (const_string "vector")]
	 (const_string "unknown")))

; All vector instructions run as 64 threads as predicated by the EXEC
; register.  Scalar operations in vector register require a single lane
; enabled, vector moves require a full set of lanes enabled, and most vector
; operations handle the lane masking themselves.
; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
; according to the following settings:
;   auto   - md_reorg will inspect def/use to determine what to do.
;   none   - exec is not needed.
;   single - disable all but lane zero.
;   full   - enable all lanes.

(define_attr "exec" "auto,none,single,full"
   (const_string "auto"))
248
; Infer the (worst-case) length in bytes from the instruction type by
; default.  Each encoding word is 32 bits, and many types can have an
; optional immediate word following, which we include here.  "Multiple"
; types are counted as two 64-bit instructions.  This is just a default
; fallback: it can be overridden per-alternative in insn patterns for
; greater accuracy.

(define_attr "length" ""
  (cond [(eq_attr "type" "sop1") (const_int 8)
	 (eq_attr "type" "sop2") (const_int 8)
	 (eq_attr "type" "sopk") (const_int 8)
	 (eq_attr "type" "sopc") (const_int 8)
	 (eq_attr "type" "sopp") (const_int 4)
	 (eq_attr "type" "smem") (const_int 8)
	 (eq_attr "type" "ds")   (const_int 8)
	 (eq_attr "type" "vop1") (const_int 8)
	 (eq_attr "type" "vop2") (const_int 8)
	 (eq_attr "type" "vopc") (const_int 8)
	 (eq_attr "type" "vop3a") (const_int 8)
	 (eq_attr "type" "vop3b") (const_int 8)
	 (eq_attr "type" "vop_sdwa") (const_int 8)
	 (eq_attr "type" "vop_dpp") (const_int 8)
	 (eq_attr "type" "flat") (const_int 8)
	 (eq_attr "type" "mult") (const_int 16)
	 (eq_attr "type" "vmult") (const_int 16)]
	; Fallback covers "unknown", "mubuf" and "mtbuf".
	; NOTE(review): mubuf/mtbuf are described above as two-word
	; encodings, yet fall through to the 4-byte default -- confirm.
	(const_int 4)))
274
; Disable alternatives that only apply to specific ISA variants.
; Alternatives default to "gcn3" (always available); those marked "gcn5"
; are only enabled when compiling for GCN5 or later (TARGET_GCN5_PLUS).

(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))

(define_attr "enabled" ""
  (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
	 (and (eq_attr "gcn_version" "gcn5")
	      (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
	   (const_int 1)]
	(const_int 0)))

; We need to be able to identify v_readlane and v_writelane with
; SGPR lane selection in order to handle "Manually Inserted Wait States".

(define_attr "laneselect" "yes,no" (const_string "no"))

; Identify instructions that require a "Manually Inserted Wait State" if
; their inputs are overwritten by subsequent instructions.

(define_attr "delayeduse" "yes,no" (const_string "no"))
295
296;; }}}
;; {{{ Iterators useful across the whole machine description
298
; Mode iterators used to instantiate one pattern for several scalar
; modes at once.
(define_mode_iterator SIDI [SI DI])
(define_mode_iterator SFDF [SF DF])
(define_mode_iterator SISF [SI SF])
(define_mode_iterator QIHI [QI HI])
(define_mode_iterator DIDF [DI DF])
(define_mode_iterator FP [HF SF DF])	   ; All supported FP modes.
(define_mode_iterator FP_1REG [HF SF])	   ; FP modes of at most 32 bits.
306
307;; }}}
308;; {{{ Attributes.
309
; Translate RTX code into GCN instruction mnemonics with and without
; suffixes such as _b32, etc.  The trailing %-codes (%i, %b, %B, %u) are
; operand punctuation expanded by the output code into a type suffix;
; NOTE(review): confirm the exact expansions in the target's
; print_operand implementation.

(define_code_attr mnemonic
  [(minus "sub%i")
   (plus "add%i")
   (ashift "lshl%b")
   (lshiftrt "lshr%b")
   (ashiftrt "ashr%i")
   (and "and%B")
   (ior "or%B")
   (xor "xor%B")
   (mult "mul%i")
   (smin "min%i")
   (smax "max%i")
   (umin "min%u")
   (umax "max%u")
   (not "not%B")
   (popcount "bcnt_u32%b")])

; As "mnemonic", but with no type-suffix punctuation at all.
(define_code_attr bare_mnemonic
  [(plus "add")
   (minus "sub")
   (and "and")
   (ior "or")
   (xor "xor")])

; Scalar-unit (s_...) mnemonic variants for unary bit operations.
(define_code_attr s_mnemonic
  [(not "not%b")
   (popcount "bcnt1_i32%b")
   (clz "flbit_i32%b")
   (ctz "ff1_i32%b")])

; Reversed-operand (...rev) mnemonic forms of the corresponding codes.
(define_code_attr revmnemonic
  [(minus "subrev%i")
   (ashift "lshlrev%b")
   (lshiftrt "lshrrev%b")
   (ashiftrt "ashrrev%i")])
348
; Translate RTX code into corresponding expander name, e.g. (plus ...)
; maps to the "add..." standard pattern names.

(define_code_attr expander
  [(and "and")
   (ior "ior")
   (xor "xor")
   (plus "add")
   (minus "sub")
   (ashift "ashl")
   (lshiftrt "lshr")
   (ashiftrt "ashr")
   (mult "mul")
   (smin "smin")
   (smax "smax")
   (umin "umin")
   (umax "umax")
   (not "one_cmpl")
   (popcount "popcount")
   (clz "clz")
   (ctz "ctz")
   (sign_extend "extend")
   (zero_extend "zero_extend")])
371
372;; }}}
373;; {{{ Miscellaneous instructions
374
; No-op: a single 4-byte scalar instruction that does nothing.
(define_insn "nop"
  [(const_int 0)]
  ""
  "s_nop\t0x0"
  [(set_attr "type" "sopp")])

; FIXME: What should the value of the immediate be? Zero is disallowed, so
; pick 1 for now.
(define_insn "trap"
  [(trap_if (const_int 1) (const_int 0))]
  ""
  "s_trap\t1"
  [(set_attr "type" "sopp")])
388
389;; }}}
390;; {{{ Moves
391
;; All scalar modes we support moves in.
(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])

; This is the entry point for creating all kinds of scalar moves,
; including reloads and symbols.

(define_expand "mov<mode>"
  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
	(match_operand:MOV_MODE 1 "general_operand"))]
  ""
  {
    if (SUBREG_P (operands[1])
	&& GET_MODE (operands[1]) == SImode
	&& GET_MODE (SUBREG_REG (operands[1])) == BImode)
    {
      /* (reg:BI VCC) has nregs==2 to ensure it gets clobbered as a whole,
	 but (subreg:SI (reg:BI VCC)) doesn't, which causes the LRA liveness
	 checks to assert.  Transform this:
	   (set (reg:SI) (subreg:SI (reg:BI)))
	 to this:
	   (set (subreg:BI (reg:SI)) (reg:BI))  */
      operands[0] = gen_rtx_SUBREG (BImode, operands[0], 0);
      operands[1] = SUBREG_REG (operands[1]);
    }
    if (SUBREG_P (operands[0])
	&& GET_MODE (operands[0]) == SImode
	&& GET_MODE (SUBREG_REG (operands[0])) == BImode)
      {
	/* Likewise, transform this:
	     (set (subreg:SI (reg:BI)) (reg:SI))
	   to this:
	     (set (reg:BI) (subreg:BI (reg:SI))) */
	operands[0] = SUBREG_REG (operands[0]);
	operands[1] = gen_rtx_SUBREG (BImode, operands[1], 0);
      }

    /* Stores must take their input from a register.  */
    if (MEM_P (operands[0]))
      operands[1] = force_reg (<MODE>mode, operands[1]);

    if (!lra_in_progress && !reload_completed
	&& !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
      {
	/* Something is probably trying to generate a move
	   which can only work indirectly.
	   E.g. Move from LDS memory to SGPR hardreg
	     or MEM:QI to SGPR.  */
	rtx tmpreg = gen_reg_rtx (<MODE>mode);
	emit_insn (gen_mov<mode> (tmpreg, operands[1]));
	emit_insn (gen_mov<mode> (operands[0], tmpreg));
	DONE;
      }

    /* Symbol and label addresses need a multi-instruction PC-relative
       sequence which clobbers SCC; during LRA use the variant that
       preserves SCC through CC_SAVE_REG.  */
    if (<MODE>mode == DImode
	&& (GET_CODE (operands[1]) == SYMBOL_REF
	    || GET_CODE (operands[1]) == LABEL_REF))
      {
	if (lra_in_progress)
	  emit_insn (gen_movdi_symbol_save_scc (operands[0], operands[1]));
	else
	  emit_insn (gen_movdi_symbol (operands[0], operands[1]));
	DONE;
      }
  })
455
; Split invalid moves into two valid moves.  This can only run before
; register allocation because it creates a fresh pseudo register.

(define_split
  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
	(match_operand:MOV_MODE 1 "general_operand"))]
  "!reload_completed && !lra_in_progress
   && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
  [(set (match_dup 2) (match_dup 1))
   (set (match_dup 0) (match_dup 2))]
  {
    operands[2] = gen_reg_rtx(<MODE>mode);
  })
468
; We need BImode move so we can reload flags registers.
;
; Alternatives (matching the switch cases in the output code):
;   0: SGPR <- scalar src		 1: VGPR <- vector/scalar src
;   2: SGPR <- VGPR lane 0		 3: SCC  <- SGPR (via s_cmpk)
;   4: VCC  <- VGPR (via v_cmp)		 5: VCC  <- SGPR or SCC
;   6/7: scalar memory load/store	 8/9: flat memory load/store
;   10/11: global memory load/store

(define_insn "*movbi"
  [(set (match_operand:BI 0 "nonimmediate_operand"
				    "=Sg,   v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
	(match_operand:BI 1 "gcn_load_operand"
				    "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
  ""
  {
    /* SCC as an operand is currently not accepted by the LLVM assembler, so
       we emit bytes directly as a workaround.  */
    switch (which_alternative) {
    case 0:
      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
	return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x0\;"
	       ".byte\t0x80|%R0\;"
	       ".byte\t0xbe";
      else
	return "s_mov_b32\t%0, %1";
    case 1:
      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
	return "; v_mov_b32\t%0, %1\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x2\;"
	       ".byte\t((%V0<<1)&0xff)\;"
	       ".byte\t0x7e|(%V0>>7)";
      else
	return "v_mov_b32\t%0, %1";
    case 2:
      return "v_readlane_b32\t%0, %1, 0";
    case 3:
      return "s_cmpk_lg_u32\t%1, 0";
    case 4:
      return "v_cmp_ne_u32\tvcc, 0, %1";
    case 5:
      if (REGNO (operands[1]) == SCC_REG)
	return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x0\;"
	       ".byte\t0xea\;"
	       ".byte\t0xbe\;"
	       "s_mov_b32\tvcc_hi, 0";
      else
	return "s_mov_b32\tvcc_lo, %1\;"
	       "s_mov_b32\tvcc_hi, 0";
    case 6:
      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
    case 7:
      return "s_store_dword\t%1, %A0";
    case 8:
      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
    case 9:
      return "flat_store_dword\t%A0, %1%O0%g0";
    case 10:
      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
    case 11:
      return "global_store_dword\t%A0, %1%O0%g0";
    default:
      gcc_unreachable ();
    }
  }
  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
		     flat,flat")
   (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")])
536
; 32bit move pattern.
;
; Covers SGPR/VGPR register moves, inline and literal constants, lane
; transfers (v_readlane/v_writelane, which need no EXEC lanes, hence
; exec "none"), and the scalar-buffer, scalar, flat, LDS/GDS and global
; memory spaces.

(define_insn "*mov<mode>_insn"
  [(set (match_operand:SISF 0 "nonimmediate_operand"
		  "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG,   v,SD, v,RM")
	(match_operand:SISF 1 "gcn_load_operand"
		  "SSA, J, B,RB,Sm,RS,Sm,v, v,Sv,RF, v,B,   v,RLRG, Y,RM, v"))]
  ""
  "@
  s_mov_b32\t%0, %1
  s_movk_i32\t%0, %1
  s_mov_b32\t%0, %1
  s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
  s_buffer_store%s1\t%1, s[0:3], %0
  s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  s_store_dword\t%1, %A0
  v_mov_b32\t%0, %1
  v_readlane_b32\t%0, %1, 0
  v_writelane_b32\t%0, %1, 0
  flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store_dword\t%A0, %1%O0%g0
  v_mov_b32\t%0, %1
  ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
  ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  s_mov_b32\t%0, %1
  global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store_dword\t%A0, %1%O0%g0"
  [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
		     flat,vop1,ds,ds,sop1,flat,flat")
   (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
568
; 8/16bit move pattern.
;
; Sub-word values are held in full 32-bit registers; only the memory
; accesses use sub-word load/store forms.  The insn condition re-checks
; gcn_valid_move_p -- presumably because invalid combinations are routed
; through the split above rather than matched here.

(define_insn "*mov<mode>_insn"
  [(set (match_operand:QIHI 0 "nonimmediate_operand"
				 "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG,   v, v,RM")
	(match_operand:QIHI 1 "gcn_load_operand"
				 "SSA, J, B,v, v,Sv,RF, v,B,   v,RLRG,RM, v"))]
  "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
  "@
  s_mov_b32\t%0, %1
  s_movk_i32\t%0, %1
  s_mov_b32\t%0, %1
  v_mov_b32\t%0, %1
  v_readlane_b32\t%0, %1, 0
  v_writelane_b32\t%0, %1, 0
  flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store%s0\t%A0, %1%O0%g0
  v_mov_b32\t%0, %1
  ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
  ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store%s0\t%A0, %1%O0%g0"
  [(set_attr "type"
	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
   (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,8,4,4,4,12,12,8,12,12,12,12")])
595
; 64bit move pattern.
;
; Alternatives emitted as "#" are pseudo encodings split after reload
; into two 32-bit moves: either a reg-reg move that is not a single
; SGPR-pair move, or a constant that cannot be encoded as one 64-bit
; immediate (see the split condition).

(define_insn_and_split "*mov<mode>_insn"
  [(set (match_operand:DIDF 0 "nonimmediate_operand"
			  "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG,   v, v,RM")
	(match_operand:DIDF 1 "general_operand"
			  "SSA, C,DB,Sm,RS,v,DB, v,Sv,RF, v,   v,RLRG,RM, v"))]
  "GET_CODE(operands[1]) != SYMBOL_REF"
  "@
  s_mov_b64\t%0, %1
  s_mov_b64\t%0, %1
  #
  s_store_dwordx2\t%1, %A0
  s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  #
  #
  #
  #
  flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store_dwordx2\t%A0, %1%O0%g0
  ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
  ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store_dwordx2\t%A0, %1%O0%g0"
  "reload_completed
   && ((!MEM_P (operands[0]) && !MEM_P (operands[1])
        && !gcn_sgpr_move_p (operands[0], operands[1]))
       || (GET_CODE (operands[1]) == CONST_INT
	   && !gcn_constant64_p (operands[1])))"
  [(set (match_dup 0) (match_dup 1))
   (set (match_dup 2) (match_dup 3))]
  {
    rtx inlo = gen_lowpart (SImode, operands[1]);
    rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
    rtx outlo = gen_lowpart (SImode, operands[0]);
    rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);

    /* Ensure that overlapping registers aren't corrupted: if the low
       output overlaps the high input, move the high halves first.  */
    if (reg_overlap_mentioned_p (outlo, inhi))
      {
	operands[0] = outhi;
	operands[1] = inhi;
	operands[2] = outlo;
	operands[3] = inlo;
      }
    else
      {
	operands[0] = outlo;
	operands[1] = inlo;
	operands[2] = outhi;
	operands[3] = inhi;
      }
  }
  [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
		     flat,ds,ds,flat,flat")
   (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
652
; 128-bit move.
;
; Register-to-register and constant moves ("#") are split after reload
; into four 32-bit moves.

(define_insn_and_split "*movti_insn"
  [(set (match_operand:TI 0 "nonimmediate_operand"
				      "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
	(match_operand:TI 1 "general_operand"
				      "SSB,Sm,RS, v,RF,v,Sv, v, v,RM, v,RL"))]
  ""
  "@
  #
  s_store_dwordx4\t%1, %A0
  s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  flat_store_dwordx4\t%A0, %1%O0%g0
  flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
  #
  #
  #
  global_store_dwordx4\t%A0, %1%O0%g0
  global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
  ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
  "reload_completed
   && REG_P (operands[0])
   && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
  [(set (match_dup 0) (match_dup 1))
   (set (match_dup 2) (match_dup 3))
   (set (match_dup 4) (match_dup 5))
   (set (match_dup 6) (match_dup 7))]
  {
    gcc_assert (rtx_equal_p (operands[0], operands[1])
		|| !reg_overlap_mentioned_p (operands[0], operands[1]));
    /* Fill the part operands from the top down: operands[0] and
       operands[1] must keep their TImode values until last, since
       gcn_operand_part reads them.  */
    operands[6] = gcn_operand_part (TImode, operands[0], 3);
    operands[7] = gcn_operand_part (TImode, operands[1], 3);
    operands[4] = gcn_operand_part (TImode, operands[0], 2);
    operands[5] = gcn_operand_part (TImode, operands[1], 2);
    operands[2] = gcn_operand_part (TImode, operands[0], 1);
    operands[3] = gcn_operand_part (TImode, operands[1], 1);
    operands[0] = gcn_operand_part (TImode, operands[0], 0);
    operands[1] = gcn_operand_part (TImode, operands[1], 0);
  }
  [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
		     ds,ds")
   (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,yes,*,*,*")
   (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
697
698;; }}}
699;; {{{ Prologue/Epilogue
700
; Zero-length marker insn wrapping UNSPECV_PROLOGUE_USE; it emits no
; code.  Presumably it keeps the prologue's register setup from being
; deleted as dead -- confirm against gcn_expand_prologue, where it is
; generated.
(define_insn "prologue_use"
  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
  ""
  ""
  [(set_attr "length" "0")])
706
; Prologue and epilogue expansion is done entirely in C code.

(define_expand "prologue"
  [(const_int 0)]
  ""
  {
    gcn_expand_prologue ();
    DONE;
  })

(define_expand "epilogue"
  [(const_int 0)]
  ""
  {
    gcn_expand_epilogue ();
    DONE;
  })
722
723;; }}}
724;; {{{ Control flow
725
; This pattern must satisfy simplejump_p, which means it cannot be a parallel
; that clobbers SCC.  Thus, we must preserve SCC if we're generating a long
; branch sequence.

(define_insn "jump"
  [(set (pc)
	(label_ref (match_operand 0)))]
  ""
  {
    if (get_attr_length (insn) == 4)
      return "s_branch\t%0";
    else
      /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG.
	 The raw .long encodes "s_mov_b32 s22, scc", which the assembler
	 will not accept; SCC is restored afterwards with s_cmpk_lg_u32.  */
      return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	     ".long\t0xbe9600fd\;"
	     "s_getpc_b64\ts[20:21]\;"
	     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
	     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
	     "s_cmpk_lg_u32\ts22, 0\;"
	     "s_setpc_b64\ts[20:21]";
  }
  [(set_attr "type" "sopp")
   ; 4 bytes for a short s_branch when the target is in range of its
   ; 16-bit displacement; otherwise 32 bytes for the indirect sequence.
   (set (attr "length")
	(if_then_else (and (ge (minus (match_dup 0) (pc))
			       (const_int -131072))
			   (lt (minus (match_dup 0) (pc))
			       (const_int 131072)))
		      (const_int 4)
		      (const_int 32)))])
755
; Jump to an address held in an SGPR pair.
(define_insn "indirect_jump"
  [(set (pc)
	(match_operand:DI 0 "register_operand" "Sg"))]
  ""
  "s_setpc_b64\t%0"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
763
; Conditional branch on a flag register (SCC or VCC) being zero/nonzero.
; Long-range branches invert the condition to skip over an indirect
; branch sequence; when the condition register is SCC itself, SCC is
; rematerialized with an always-true/false s_cmp instead of being saved.
(define_insn "cjump"
  [(set (pc)
	(if_then_else
	  (match_operator:BI 1 "gcn_conditional_operator"
	    [(match_operand:BI 2 "gcn_conditional_register_operand" "ca,cV")
	     (const_int 0)])
	  (label_ref (match_operand 0))
	  (pc)))]
  ""
  {
    if (get_attr_length (insn) == 4)
      return "s_cbranch%C1\t%0";
    else
      {
	/* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG but
	       restores SCC.  */
	if (REGNO (operands[2]) == SCC_REG)
	  {
	    if (GET_CODE (operands[1]) == EQ)
	      return "s_cbranch%c1\t.Lskip%=\;"
		     "s_getpc_b64\ts[20:21]\;"
		     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		     "s_cmp_lg_u32\t0, 0\;"
		     "s_setpc_b64\ts[20:21]\n"
		     ".Lskip%=:";
	    else
	      return "s_cbranch%c1\t.Lskip%=\;"
		     "s_getpc_b64\ts[20:21]\;"
		     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		     "s_cmp_eq_u32\t0, 0\;"
		     "s_setpc_b64\ts[20:21]\n"
		     ".Lskip%=:";
	  }
	else
	  return "s_cbranch%c1\t.Lskip%=\;"
		 "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
		 ".byte\t0xfd\;"
		 ".byte\t0x0\;"
		 ".byte\t0x80|22\;"
		 ".byte\t0xbe\;"
		 "s_getpc_b64\ts[20:21]\;"
		 "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		 "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		 "s_cmpk_lg_u32\ts22, 0\;"
		 "s_setpc_b64\ts[20:21]\n"
		 ".Lskip%=:";
      }
  }
  [(set_attr "type" "sopp")
   ; 4 bytes for the short form; 36 bytes worst case for the
   ; skip-over-indirect-branch form (s_cbranch plus the 32-byte sequence).
   (set (attr "length")
	(if_then_else (and (ge (minus (match_dup 0) (pc))
			       (const_int -131072))
			   (lt (minus (match_dup 0) (pc))
			       (const_int 131072)))
		      (const_int 4)
		      (const_int 36)))])
822
; Returning from a normal function is different to returning from a
; kernel function.  Normal functions return through the link register
; pair s[18:19] (LR_REGNUM); kernels drain outstanding scalar memory
; operations, write back the data cache and end the program.

(define_insn "gcn_return"
  [(return)]
  ""
  {
    if (cfun && cfun->machine && cfun->machine->normal_function)
      return "s_setpc_b64\ts[18:19]";
    else
      return "s_waitcnt\tlgkmcnt(0)\;s_dcache_wb\;s_endpgm";
  }
  [(set_attr "type" "sop1")
   (set_attr "length" "12")])
837
; Direct call expander; the insn patterns below add the link-register
; clobber and a scratch (only actually needed by the long-form call).

(define_expand "call"
  [(parallel [(call (match_operand 0 "")
		    (match_operand 1 ""))
	      (clobber (reg:DI LR_REGNUM))
	      (clobber (match_scratch:DI 2))])]
  ""
  {})

; Call to an immediate symbol.  Alternative 0 materializes a PC-relative
; address in the scratch register pair first (four instructions, 24
; bytes); alternative 1 calls a short-form operand directly (4 bytes).
(define_insn "gcn_simple_call"
  [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
	 (match_operand 1 "const_int_operand"))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 2 "=&Sg,X"))]
  ""
  "@
  s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
  s_swappc_b64\ts[18:19], %0"
  [(set_attr "type" "mult,sop1")
   (set_attr "length" "24,4")])
857
; Load a symbol or label address into an SGPR pair.  Weak symbols are
; resolved through the GOT (@gotpcrel32), everything else is a plain
; PC-relative address computation.  The adds set SCC, hence the clobber.
(define_insn "movdi_symbol"
 [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
       (match_operand:DI 1 "general_operand" "Y"))
  (clobber (reg:BI SCC_REG))]
 "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
  {
    if (SYMBOL_REF_P (operands[1])
	&& SYMBOL_REF_WEAK (operands[1]))
	return "s_getpc_b64\t%0\;"
	       "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
	       "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
	       "s_load_dwordx2\t%0, %0\;"
	       "s_waitcnt\tlgkmcnt(0)";

    return "s_getpc_b64\t%0\;"
	   "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
	   "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
  }
 [(set_attr "type" "mult")
  (set_attr "length" "32")])
878
; As movdi_symbol, but for use during/after LRA when SCC may be live:
; SCC is copied into CC_SAVE_REG (s22, via the raw .long encoding of
; "s_mov_b32 s22, scc", which the assembler rejects in source form) and
; restored afterwards with s_cmpk_lg_u32.
(define_insn "movdi_symbol_save_scc"
 [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
       (match_operand:DI 1 "general_operand" "Y"))
  (clobber (reg:BI CC_SAVE_REG))]
 "(GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF)
  && (lra_in_progress || reload_completed)"
  {
    /* !!! These sequences clobber CC_SAVE_REG.  */

    if (SYMBOL_REF_P (operands[1])
	&& SYMBOL_REF_WEAK (operands[1]))
	return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	       ".long\t0xbe9600fd\;"
	       "s_getpc_b64\t%0\;"
	       "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
	       "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
	       "s_load_dwordx2\t%0, %0\;"
	       "s_cmpk_lg_u32\ts22, 0\;"
	       "s_waitcnt\tlgkmcnt(0)";

    return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	   ".long\t0xbe9600fd\;"
	   "s_getpc_b64\t%0\;"
	   "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
	   "s_addc_u32\t%H0, %H0, %1@rel32@hi+4\;"
	   "s_cmpk_lg_u32\ts22, 0";
  }
 [(set_attr "type" "mult")
  (set_attr "length" "40")])
908
909
; Call through a function address held in an SGPR pair; no scratch needed.
(define_insn "gcn_indirect_call"
  [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
	 (match_operand 1 "" ""))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 2 "=X"))]
  ""
  "s_swappc_b64\ts[18:19], %0"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
919
; Value-returning call expander; mirrors "call" above with the result
; destination as operand 0.

(define_expand "call_value"
  [(parallel [(set (match_operand 0 "")
		   (call (match_operand 1 "")
			 (match_operand 2 "")))
	      (clobber (reg:DI LR_REGNUM))
	      (clobber (match_scratch:DI 3))])]
  ""
  {})
928
; Value-returning call to an immediate symbol; the call halves mirror
; "gcn_simple_call" exactly.  Alternative 0 materializes a PC-relative
; address in the scratch pair (a four-instruction, 24-byte sequence);
; alternative 1 calls a short-form operand directly (one 4-byte sop1).
; The attributes are set per-alternative to match gcn_simple_call; the
; previous single "sop1"/"24" mislabelled both alternatives.
(define_insn "gcn_call_value"
  [(set (match_operand 0 "register_operand" "=Sg,Sg")
	(call (mem (match_operand 1 "immediate_operand" "Y,B"))
	      (match_operand 2 "const_int_operand")))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 3 "=&Sg,X"))]
  ""
  "@
  s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
  s_swappc_b64\ts[18:19], %1"
  [(set_attr "type" "mult,sop1")
   (set_attr "length" "24,4")])
941
; Value-returning call through a function address in an SGPR pair.
(define_insn "gcn_call_value_indirect"
  [(set (match_operand 0 "register_operand" "=Sg")
	(call (mem (match_operand:DI 1 "register_operand" "Sg"))
	      (match_operand 2 "" "")))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 3 "=X"))]
  ""
  "s_swappc_b64\ts[18:19], %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
952
; GCN does not have an instruction to clear only part of the instruction
; cache, so the operands (the address range to clear) are ignored and the
; whole cache is invalidated.

(define_insn "clear_icache"
  [(unspec_volatile
    [(match_operand 0 "") (match_operand 1 "")]
    UNSPECV_ICACHE_INV)]
  ""
  "s_icache_inv"
  [(set_attr "type" "sopp")
   (set_attr "length" "4")])
964
965;; }}}
966;; {{{ Conditionals
967
; 32-bit compare, scalar unit only.
;
; Alternatives: 0 - register/register (sopc, 4 bytes); 1 - s_cmpk with an
; inline constant (sopk, 4 bytes); 2/3 - a literal constant in either
; position (8 bytes).  NOTE(review): constraints SSL/SSB come from
; constraints.md -- confirm SSL is the 16-bit s_cmpk range and SSB a
; full 32-bit literal, and that alternatives 2/3 are intentionally typed
; "sopk" despite emitting s_cmp.

(define_insn "cstoresi4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand"
							 "=cs, cs, cs, cs")
	(match_operator:BI 1 "gcn_compare_operator"
	  [(match_operand:SI 2 "gcn_alu_operand"	 "SSA,SSA,SSB, SS")
	   (match_operand:SI 3 "gcn_alu_operand"	 "SSA,SSL, SS,SSB")]))]
  ""
  "@
   s_cmp%D1\t%2, %3
   s_cmpk%D1\t%2, %3
   s_cmp%D1\t%2, %3
   s_cmp%D1\t%2, %3"
  [(set_attr "type" "sopc,sopk,sopk,sopk")
   (set_attr "length" "4,4,8,8")])
984
; Conditional branch on a 32-bit comparison: materialize the condition
; into a BImode register via cstoresi4, then branch when it is nonzero.
(define_expand "cbranchsi4"
  [(match_operator 0 "gcn_compare_operator"
     [(match_operand:SI 1 "gcn_alu_operand")
      (match_operand:SI 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
998
999; 64-bit compare; either unit, but scalar allows limited operators
1000
; 64-bit compare-and-store expander; the insns below pick the scalar or
; vector unit (the scalar unit supports only a limited operator set).
(define_expand "cstoredi4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand")
	(match_operator:BI 1 "gcn_compare_operator"
			   [(match_operand:DI 2 "gcn_alu_operand")
			    (match_operand:DI 3 "gcn_alu_operand")]))]
  ""
  {})
1008
; 64-bit compare on either unit.  The operator is restricted to
; gcn_compare_64bit_operator so that the scalar s_cmp alternative is
; valid; the vector alternative writes its result to VCC.
(define_insn "cstoredi4_vec_and_scalar"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs,  cV")
	(match_operator:BI 1 "gcn_compare_64bit_operator"
	  [(match_operand:DI 2 "gcn_alu_operand"	       "%SSA,vSvC")
	   (match_operand:DI 3 "gcn_alu_operand"	       " SSC,   v")]))]
  ""
  "@
   s_cmp%D1\t%2, %3
   v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "sopc,vopc")
   (set_attr "length" "8")])
1020
; 64-bit compare on the vector unit only; any comparison operator is
; permitted, with the result written to VCC.
(define_insn "cstoredi4_vector"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
	(match_operator:BI 1 "gcn_compare_operator"
          [(match_operand:DI 2 "gcn_alu_operand"	       "vSvB")
	   (match_operand:DI 3 "gcn_alu_operand"	       "   v")]))]
  ""
  "v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "vopc")
   (set_attr "length" "8")])
1030
; Conditional branch on a 64-bit comparison; same cstore-then-cjump
; shape as cbranchsi4.
(define_expand "cbranchdi4"
  [(match_operator 0 "gcn_compare_operator"
     [(match_operand:DI 1 "gcn_alu_operand")
      (match_operand:DI 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
1044
1045; FP compare; vector unit only
1046
; Floating-point (SF/DF) compare; vector unit only, result in VCC.
(define_insn "cstore<mode>4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
	(match_operator:BI 1 "gcn_fp_compare_operator"
	  [(match_operand:SFDF 2 "gcn_alu_operand"		"vB")
	   (match_operand:SFDF 3 "gcn_alu_operand"		 "v")]))]
  ""
  "v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "vopc")
   (set_attr "length" "8")])
1056
; Conditional branch on a floating-point comparison; same
; cstore-then-cjump shape as the integer cbranch expanders.
(define_expand "cbranch<mode>4"
  [(match_operator 0 "gcn_fp_compare_operator"
     [(match_operand:SFDF 1 "gcn_alu_operand")
      (match_operand:SFDF 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
1070
1071;; }}}
1072;; {{{ ALU special cases: Plus
1073
; 32-bit add.  The scalar alternatives clobber SCC (scratch 3, "cs") and
; the vector alternative clobbers VCC (scratch 4, "cV").  Alternative 1
; uses the compact s_addk_i32 form, which needs a 16-bit immediate
; ("SgJ") and a destination matching operand 1 ("0").
(define_insn "addsi3"
  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,   v")
        (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA,   v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B,vBSv")))
   (clobber (match_scratch:BI 3			       "= cs, cs, cs,   X"))
   (clobber (match_scratch:DI 4			       "=  X,  X,  X,  cV"))]
  ""
  "@
   s_add_i32\t%0, %1, %2
   s_addk_i32\t%0, %2
   s_add_i32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,sopk,sop2,vop2")
   (set_attr "length" "4,4,8,8")])
1088
; Variant expander that explicitly clobbers SCC (rather than leaving the
; CC register choice to the constraint alternative), for callers that
; must know which flags register is affected.
(define_expand "addsi3_scc"
  [(parallel [(set (match_operand:SI 0 "register_operand")
		   (plus:SI (match_operand:SI 1 "gcn_alu_operand")
			    (match_operand:SI 2 "gcn_alu_operand")))
	      (clobber (reg:BI SCC_REG))
	      (clobber (scratch:DI))])]
  ""
  {})
1097
1098; Having this as an insn_and_split allows us to keep together DImode adds
1099; through some RTL optimisation passes, and means the CC reg we set isn't
1100; dependent on the constraint alternative (which doesn't seem to work well).
1101
1102; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
1103; used as an operand due to the read of VCC, so we restrict constants to the
1104; inlinable range for that alternative.
1105
(define_insn_and_split "adddi3"
  [(set (match_operand:DI 0 "register_operand"		 "=Sg, v")
	(plus:DI (match_operand:DI 1 "register_operand"  " Sg, v")
		 (match_operand:DI 2 "nonmemory_operand" "SgB,vA")))
   (clobber (match_scratch:BI 3				 "=cs, X"))
   (clobber (match_scratch:DI 4				 "= X,cV"))]
  ""
  "#"
  "&& reload_completed"
  [(const_int 0)]
  {
    /* The carry register depends on which unit performs the add:
       VCC for VGPR operands, SCC for SGPR operands.  */
    rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
							     DImode)
			  ? VCC_REG : SCC_REG);

    /* Low part: plain 32-bit add that records the carry-out in CC.  */
    emit_insn (gen_addsi3_scalar_carry
	       (gcn_operand_part (DImode, operands[0], 0),
		gcn_operand_part (DImode, operands[1], 0),
		gcn_operand_part (DImode, operands[2], 0),
		cc));
    /* High part: add with carry-in; use the cheaper zero-addend form
       when the high part of operand 2 is known to be zero.  */
    rtx val = gcn_operand_part (DImode, operands[2], 1);
    if (val != const0_rtx)
      emit_insn (gen_addcsi3_scalar
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  gcn_operand_part (DImode, operands[2], 1),
		  cc, cc));
    else
      emit_insn (gen_addcsi3_scalar_zero
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  cc));
    DONE;
  }
  [(set_attr "type" "mult,vmult")
   (set_attr "length" "8")])
1142
; 64-bit add that explicitly clobbers SCC; the DImode analogue of
; addsi3_scc.
(define_expand "adddi3_scc"
  [(parallel [(set (match_operand:DI 0 "register_operand")
		   (plus:DI (match_operand:DI 1 "register_operand")
			    (match_operand:DI 2 "nonmemory_operand")))
	      (clobber (reg:BI SCC_REG))
	      (clobber (scratch:DI))])]
  ""
  {})
1151
1152;; Add with carry.
1153
; Low-part add that also records the unsigned carry-out in operand 3:
; (ltu (plus a b) a) is 1 exactly when the 32-bit addition wrapped.
(define_insn "addsi3_scalar_carry"
  [(set (match_operand:SI 0 "register_operand"	       "= Sg, v")
	(plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
   (set (match_operand:BI 3 "register_operand"	       "= cs,cV")
	(ltu:BI (plus:SI (match_dup 1)
			 (match_dup 2))
		(match_dup 1)))]
  ""
  "@
   s_add_u32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8,8")])
1168
; As addsi3_scalar_carry, but for a constant addend.  The condition
; requires operand 3 to be the negation of operand 2, matching the
; canonical RTL form of the carry-out test when one addend is constant.
; Note: the carry destination is operand 4, declared before operand 3.
(define_insn "addsi3_scalar_carry_cst"
  [(set (match_operand:SI 0 "register_operand"           "=Sg, v")
        (plus:SI (match_operand:SI 1 "gcn_alu_operand"   "SgA, v")
		 (match_operand:SI 2 "const_int_operand" "  n, n")))
   (set (match_operand:BI 4 "register_operand"           "=cs,cV")
	(geu:BI (plus:SI (match_dup 1)
			 (match_dup 2))
		(match_operand:SI 3 "const_int_operand"  "  n, n")))]
  "INTVAL (operands[2]) == -INTVAL (operands[3])"
  "@
   s_add_u32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "4")])
1183
; High-part add with carry-in: dest = op1 + op2 + carry (operand 3).
; The second set computes the carry-out of the three-way addition; the
; "3" constraint ties operand 4 to the same register as operand 3, so
; the carry register is updated in place.
(define_insn "addcsi3_scalar"
  [(set (match_operand:SI 0 "register_operand"			   "= Sg, v")
	(plus:SI (plus:SI (zero_extend:SI
			    (match_operand:BI 3 "register_operand" "= cs,cV"))
			  (match_operand:SI 1 "gcn_alu_operand"    "%SgA, v"))
		 (match_operand:SI 2 "gcn_alu_operand"		   " SgB,vA")))
   (set (match_operand:BI 4 "register_operand"			   "=  3, 3")
	(ior:BI (ltu:BI (plus:SI
			  (plus:SI
			    (zero_extend:SI (match_dup 3))
			    (match_dup 1))
			  (match_dup 2))
			(match_dup 2))
		(ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
			(match_dup 1))))]
  ""
  "@
   s_addc_u32\t%0, %1, %2
   v_addc%^_u32\t%0, vcc, %2, %1, vcc"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8,4")])
1205
; High-part add with carry-in for a zero high addend: dest = op1 + carry,
; with the carry register (operand 2) updated in place by the second set.
(define_insn "addcsi3_scalar_zero"
  [(set (match_operand:SI 0 "register_operand"		  "=Sg, v")
        (plus:SI (zero_extend:SI
		   (match_operand:BI 2 "register_operand" "=cs,cV"))
		 (match_operand:SI 1 "gcn_alu_operand"    "SgA, v")))
   (set (match_dup 2)
	(ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
			 (match_dup 1))
		(match_dup 1)))]
  ""
  "@
   s_addc_u32\t%0, %1, 0
   v_addc%^_u32\t%0, vcc, 0, %1, vcc"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "4")])
1221
1222; "addptr" is the same as "add" except that it must not write to VCC or SCC
1223; as a side-effect.  Unfortunately GCN does not have a suitable instruction
1224; for this, so we use CC_SAVE_REG as a temp.
1225; Note that it is not safe to save/clobber/restore as separate insns because
1226; doing so will break data-flow analysis, so this must use multiple
1227; instructions in one insn.
1228;
1229; The "v0" should be just "v", but somehow the "0" helps LRA not loop forever
1230; on testcase pr54713-2.c with -O0. It's only an optimization hint anyway.
1231;
1232; The SGPR alternative is preferred as it is typically used with mov_sgprbase.
1233
(define_insn "addptrdi3"
  [(set (match_operand:DI 0 "register_operand"		 "= v, Sg")
    (unspec:DI [
	(plus:DI (match_operand:DI 1 "register_operand"	 "^v0,Sg0")
		 (match_operand:DI 2 "nonmemory_operand" "vDA,SgDB"))]
	UNSPEC_ADDPTR))]
  ""
  {
    if (which_alternative == 0)
      {
	/* VGPR version: route the carry through the CC_SAVE_REG SGPR
	   pair instead of VCC, leaving VCC untouched.  */
	rtx new_operands[4] = { operands[0], operands[1], operands[2],
				gen_rtx_REG (DImode, CC_SAVE_REG) };

	output_asm_insn ("v_add%^_u32\t%L0, %3, %L2, %L1", new_operands);
	output_asm_insn ("v_addc%^_u32\t%H0, %3, %H2, %H1, %3", new_operands);
      }
    else
      {
	/* SGPR version: the add necessarily clobbers SCC, so save SCC
	   into CC_SAVE_REG first and restore it afterwards with
	   s_cmpk_lg_u32 (sets SCC = (%3 != 0)).  */
	rtx new_operands[4] = { operands[0], operands[1], operands[2],
				gen_rtx_REG (BImode, CC_SAVE_REG) };

	output_asm_insn ("s_mov_b32\t%3, scc", new_operands);
	output_asm_insn ("s_add_u32\t%L0, %L1, %L2", new_operands);
	output_asm_insn ("s_addc_u32\t%H0, %H1, %H2", new_operands);
	output_asm_insn ("s_cmpk_lg_u32\t%3, 0", new_operands);
      }

    return "";
  }
  [(set_attr "type" "vmult,mult")
   (set_attr "length" "16,24")])
1265
1266;; }}}
1267;; {{{ ALU special cases: Minus
1268
; 32-bit subtraction.  Scalar alternatives clobber SCC; vector
; alternatives clobber VCC.  v_subrev takes its source operands reversed,
; which allows the wider "vBSv" operand in the subtrahend position.
(define_insn "subsi3"
  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,   v")
	(minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA,    v,vBSv")
		  (match_operand:SI 2 "gcn_alu_operand" "SgA,  B, vBSv,   v")))
   (clobber (match_scratch:BI 3				"=cs, cs,    X,   X"))
   (clobber (match_scratch:DI 4				"= X,  X,   cV,  cV"))]
  ""
  "@
   s_sub_i32\t%0, %1, %2
   s_sub_i32\t%0, %1, %2
   v_subrev%^_u32\t%0, vcc, %2, %1
   v_sub%^_u32\t%0, vcc, %1, %2"
  [(set_attr "type" "sop2,sop2,vop2,vop2")
   (set_attr "length" "4,8,8,8")])
1283
; 64-bit subtraction, scalar unit only; split after reload into a
; low-part subtract (setting the borrow in SCC) and a high-part
; subtract-with-borrow, mirroring the adddi3 split above.
(define_insn_and_split "subdi3"
  [(set (match_operand:DI 0 "register_operand"        "=Sg, Sg")
	(minus:DI
		(match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
		(match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
   (clobber (reg:BI SCC_REG))]
  ""
  "#"
  "reload_completed"
  [(const_int 0)]
  {
    /* Low part: subtract, recording the borrow in SCC.  */
    emit_insn (gen_subsi3_scalar_carry
	       (gcn_operand_part (DImode, operands[0], 0),
		gcn_operand_part (DImode, operands[1], 0),
		gcn_operand_part (DImode, operands[2], 0)));
    /* High part: subtract with borrow; use the zero-subtrahend form
       when the high part of operand 2 is known to be zero.  */
    rtx val = gcn_operand_part (DImode, operands[2], 1);
    if (val != const0_rtx)
      emit_insn (gen_subcsi3_scalar
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  gcn_operand_part (DImode, operands[2], 1)));
    else
      emit_insn (gen_subcsi3_scalar_zero
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1)));
    DONE;
  }
  [(set_attr "length" "8")])
1312
; Low-part subtract recording the borrow in SCC:
; (gtu (minus a b) a) is 1 exactly when the subtraction wrapped.
(define_insn "subsi3_scalar_carry"
  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg")
        (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
		  (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
   (set (reg:BI SCC_REG)
	(gtu:BI (minus:SI (match_dup 1)
			  (match_dup 2))
		(match_dup 1)))]
  ""
  "s_sub_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "8")])
1325
; As subsi3_scalar_carry, but for a constant subtrahend.  The condition
; requires operand 3 to be the negation of operand 2, matching the
; canonical RTL form of the borrow test when the subtrahend is constant.
(define_insn "subsi3_scalar_carry_cst"
  [(set (match_operand:SI 0 "register_operand"           "=Sg")
        (minus:SI (match_operand:SI 1 "gcn_alu_operand"  "SgA")
		 (match_operand:SI 2 "const_int_operand" "  n")))
   (set (reg:BI SCC_REG)
	(leu:BI (minus:SI (match_dup 1)
			 (match_dup 2))
		(match_operand:SI 3 "const_int_operand"  "  n")))]
  "INTVAL (operands[2]) == -INTVAL (operands[3])"
  "s_sub_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "4")])
1338
; High-part subtract with borrow (s_subb_u32): SCC carries the borrow
; both in and out.  The second set describes the combined borrow-out of
; the two nested subtractions.
(define_insn "subcsi3_scalar"
  [(set (match_operand:SI 0 "register_operand"                    "=Sg, Sg")
        (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
			    (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
		 (match_operand:SI 2 "gcn_alu_operand"            "SgB,SgA")))
   (set (reg:BI SCC_REG)
	(ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
					    (match_dup 1))
				 (match_dup 2))
			(match_dup 1))
		(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
				  (match_dup 1))
			(match_dup 1))))]
  ""
  "s_subb_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "8")])
1356
; High-part subtract with borrow when the subtrahend's high part is
; zero; SCC is updated with the borrow-out.
(define_insn "subcsi3_scalar_zero"
  [(set (match_operand:SI 0 "register_operand"		"=Sg")
        (minus:SI (zero_extend:SI (reg:BI SCC_REG))
		  (match_operand:SI 1 "gcn_alu_operand" "SgA")))
   (set (reg:BI SCC_REG)
	(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
		(match_dup 1)))]
  ""
  "s_subb_u32\t%0, %1, 0"
  [(set_attr "type" "sop2")
   (set_attr "length" "4")])
1368
1369;; }}}
1370;; {{{ ALU: mult
1371
1372; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
1373; immediate.
; 32-bit multiply (low half of the product).  Alternative 1 uses the
; compact s_mulk_i32 form (16-bit immediate "J", destination tied to
; operand 1); the vector alternative is vop3a and so takes no 32-bit
; literal.
(define_insn "mulsi3"
  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg,   v")
        (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA,   v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B,vASv")))]
  ""
  "@
   s_mul_i32\t%0, %1, %2
   s_mulk_i32\t%0, %2
   s_mul_i32\t%0, %1, %2
   v_mul_lo_i32\t%0, %1, %2"
  [(set_attr "type" "sop2,sopk,sop2,vop3a")
   (set_attr "length" "4,4,8,4")])
1386
; Iterator and attributes shared by the signed/unsigned widening
; multiply patterns below: they select mnemonic suffixes and operand
; print modifiers according to the extension kind.
(define_code_iterator any_extend [sign_extend zero_extend])
(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
(define_code_attr su [(sign_extend "s") (zero_extend "u")])
(define_code_attr u [(sign_extend "") (zero_extend "u")])
(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
(define_code_attr e [(sign_extend "e") (zero_extend "")])
1393
; High 32 bits of a widening 32x32->64 signed/unsigned multiply;
; vector unit only (v_mul_hi_i32 / v_mul_hi_u32).
(define_insn "<su>mulsi3_highpart"
  [(set (match_operand:SI 0 "register_operand"	       "= v")
	(truncate:SI
	  (lshiftrt:DI
	    (mult:DI
	      (any_extend:DI
		(match_operand:SI 1 "register_operand" "% v"))
	      (any_extend:DI
		(match_operand:SI 2 "register_operand" "vSv")))
	    (const_int 32))))]
  ""
  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
  [(set_attr "type" "vop3a")
   (set_attr "length" "8")])
1408
; Widening 16x16->32-bit multiply using the 24-bit multiplier with SDWA
; operand selection (WORD_0 picks the low 16 bits of each source).
(define_insn "<u>mulhisi3"
  [(set (match_operand:SI 0 "register_operand"			"=v")
	(mult:SI
	  (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
	  (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
  ""
  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
  [(set_attr "type" "vop_sdwa")
   (set_attr "length" "8")])
1418
; Widening 8x8->16-bit multiply, analogous to <u>mulhisi3 but selecting
; the low byte of each source (BYTE_0).
(define_insn "<u>mulqihi3_scalar"
  [(set (match_operand:HI 0 "register_operand"			"=v")
	(mult:HI
	  (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
	  (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
  ""
  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
  [(set_attr "type" "vop_sdwa")
   (set_attr "length" "8")])
1428
1429;; }}}
1430;; {{{ ALU: generic 32-bit unop
1431
; Unary bit operations (NOT, POPCOUNT).  The vector popcount mnemonic
; takes a trailing ", 0" extra operand, supplied by popcount_extra_op.
(define_code_iterator bitunop [not popcount])
(define_code_attr popcount_extra_op [(not "") (popcount ", 0")])

; Generic 32-bit unary bit operation on either unit; the scalar form
; clobbers SCC.
(define_insn "<expander>si2"
  [(set (match_operand:SI 0 "register_operand"  "=Sg,   v")
        (bitunop:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
   (clobber (match_scratch:BI 2			"=cs,   X"))]
  ""
  "@
   s_<s_mnemonic>0\t%0, %1
   v_<mnemonic>0\t%0, %1<popcount_extra_op>"
  [(set_attr "type" "sop1,vop1")
   (set_attr "length" "8")])
1446
; Count leading/trailing zeros; scalar unit only.  The second
; alternative allows a 32-bit literal ("B"), hence length 8.
(define_code_iterator countzeros [clz ctz])

(define_insn "<expander>si2"
  [(set (match_operand:SI 0 "register_operand"  "=Sg,Sg")
        (countzeros:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgA, B")))]
  ""
  "s_<s_mnemonic>1\t%0, %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4,8")])
1457
1458; The truncate ensures that a constant passed to operand 1 is treated as DImode
; 64-bit count leading/trailing zeros; the SImode result is wrapped in
; a truncate so a constant operand 1 is forced to DImode (see comment
; above).  Scalar unit only.
(define_insn "<expander>di2"
  [(set (match_operand:SI 0 "register_operand"    "=Sg,Sg")
	(truncate:SI
	  (countzeros:DI
	    (match_operand:DI 1 "gcn_alu_operand" "SgA, B"))))]
  ""
  "s_<s_mnemonic>1\t%0, %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4,8")])
1468
1469;; }}}
1470;; {{{ ALU: generic 32-bit binop
1471
1472; No plus and mult - they have variant with 16bit immediate
1473; and thus are defined later.
; Commutative operations ("%" modifier) and non-commutative shifts are
; split into separate iterators because the shift patterns need the
; reversed vector mnemonic.
(define_code_iterator binop [and ior xor smin smax umin umax
				 ashift lshiftrt ashiftrt])
(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])

; Commutative 32-bit binop on the scalar unit (clobbers SCC), the
; vector unit, or directly on LDS memory ("RD", ds_* instruction with a
; matching destination).
(define_insn "<expander>si3"
  [(set (match_operand:SI 0 "gcn_valu_dst_operand"    "= Sg,   v,RD")
        (vec_and_scalar_com:SI
	  (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
	  (match_operand:SI 2 "gcn_alu_operand"	      " SgB,   v, v")))
   (clobber (match_scratch:BI 3			      "= cs,   X, X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   v_<mnemonic>0\t%0, %1, %2
   ds_<mnemonic>0\t%A0, %2%O0"
  [(set_attr "type" "sop2,vop2,ds")
   (set_attr "length" "8")])
1492
; 32-bit shifts (non-commutative).  The scalar alternatives clobber SCC;
; the vector alternative uses the reversed mnemonic with %2 and %1
; swapped in the template.
(define_insn "<expander>si3"
  [(set (match_operand:SI 0 "register_operand"  "=Sg, Sg,   v")
        (vec_and_scalar_nocom:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA,   v")
	  (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
   (clobber (match_scratch:BI 3			"=cs, cs,   X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   s_<mnemonic>0\t%0, %1, %2
   v_<revmnemonic>0\t%0, %2, %1"
  [(set_attr "type" "sop2,sop2,vop2")
   (set_attr "length" "8")])
1506
; Binop expander that explicitly clobbers SCC, for callers that need a
; known CC register (same idea as addsi3_scc).
(define_expand "<expander>si3_scc"
  [(parallel [(set (match_operand:SI 0 "gcn_valu_dst_operand")
		   (binop:SI
		     (match_operand:SI 1 "gcn_valu_src0_operand")
		     (match_operand:SI 2 "gcn_alu_operand")))
	      (clobber (reg:BI SCC_REG))])]
  ""
  {})
1515
1516;; }}}
1517;; {{{ ALU: generic 64-bit
1518
(define_code_iterator vec_and_scalar64_com [and ior xor])

; 64-bit bitwise operations: a single scalar instruction, or, for VGPR
; destinations, a post-reload split into two 32-bit vector operations
; (one per half).
(define_insn_and_split "<expander>di3"
   [(set (match_operand:DI 0 "register_operand"  "= Sg,    v")
	 (vec_and_scalar64_com:DI
	  (match_operand:DI 1 "gcn_alu_operand"  "%SgA,vSvDB")
	   (match_operand:DI 2 "gcn_alu_operand" " SgC,    v")))
   (clobber (match_scratch:BI 3			 "= cs,    X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   #"
  "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
  [(parallel [(set (match_dup 4)
		   (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
	      (clobber (match_dup 3))])
   (parallel [(set (match_dup 7)
		   (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
	      (clobber (match_dup 3))])]
  {
    /* Operands 4-6 are the low halves; 7-9 are the high halves.  */
    operands[4] = gcn_operand_part (DImode, operands[0], 0);
    operands[5] = gcn_operand_part (DImode, operands[1], 0);
    operands[6] = gcn_operand_part (DImode, operands[2], 0);
    operands[7] = gcn_operand_part (DImode, operands[0], 1);
    operands[8] = gcn_operand_part (DImode, operands[1], 1);
    operands[9] = gcn_operand_part (DImode, operands[2], 1);
  }
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8")])
1548
; 64-bit shifts by an SImode amount; as with the 32-bit shifts, the
; vector alternative uses the reversed mnemonic with swapped operands.
(define_insn "<expander>di3"
  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,   v")
	(vec_and_scalar_nocom:DI
	  (match_operand:DI 1 "gcn_alu_operand"  "SgC,SgA,   v")
	  (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgC,vSvC")))
   (clobber (match_scratch:BI 3			 "=cs, cs,   X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   s_<mnemonic>0\t%0, %1, %2
   v_<revmnemonic>0\t%0, %2, %1"
  [(set_attr "type" "sop2,sop2,vop2")
   (set_attr "length" "8")])
1562
1563;; }}}
1564;; {{{ ALU: generic 128-bit binop
1565
1566; TImode shifts can't be synthesized by the middle-end
; 128-bit shift, synthesized from DImode shifts on the two halves.
; Constant shift amounts select one code path at expand time; variable
; amounts emit all three paths with runtime branches between them.
(define_expand "<expander>ti3"
  [(set (match_operand:TI 0 "register_operand")
	(vec_and_scalar_nocom:TI
	  (match_operand:TI 1 "gcn_alu_operand")
	  (match_operand:SI 2 "gcn_alu_operand")))]
  ""
  {
    rtx dest = operands[0];
    rtx src = operands[1];
    rtx shift = operands[2];

    /* Identify the shift direction and pick the helper generators:
       the "inverse" shift recovers the bits that cross the 64-bit
       boundary; the "logical" shift moves the receiving half.  */
    enum {ashr, lshr, ashl} shiftop = <expander>;
    rtx (*inverse_shift_fn) (rtx, rtx, rtx)
      = (shiftop == ashl ? gen_lshrdi3 : gen_ashldi3);
    rtx (*logical_shift_fn) (rtx, rtx, rtx)
      = (shiftop == ashl ? gen_ashldi3 : gen_lshrdi3);

    /* We shift "from" one subreg "to" the other, according to shiftop.  */
    int from = (shiftop == ashl ? 0 : 8);
    int to = (shiftop == ashl ? 8 : 0);
    rtx destfrom = simplify_gen_subreg (DImode, dest, TImode, from);
    rtx destto = simplify_gen_subreg (DImode, dest, TImode, to);
    rtx srcfrom = simplify_gen_subreg (DImode, src, TImode, from);
    rtx srcto = simplify_gen_subreg (DImode, src, TImode, to);

    /* Classify the shift amount: known-zero, known-small (< 64),
       known-large (>= 64), or only known at runtime.  */
    int shiftval = (CONST_INT_P (shift) ? INTVAL (shift) : -1);
    enum {RUNTIME, ZERO, SMALL, LARGE} shiftcomparison
     = (!CONST_INT_P (shift) ? RUNTIME
        : shiftval == 0 ? ZERO
        : shiftval < 64 ? SMALL
        : LARGE);

    rtx large_label, zero_label, exit_label;

    if (shiftcomparison == RUNTIME)
      {
        /* Runtime dispatch: branch to the zero or large path when the
           amount is 0 or >= 64; fall through to the small path.  */
        zero_label = gen_label_rtx ();
        large_label = gen_label_rtx ();
        exit_label = gen_label_rtx ();

        rtx cond = gen_rtx_EQ (VOIDmode, shift, const0_rtx);
        emit_insn (gen_cbranchsi4 (cond, shift, const0_rtx, zero_label));

        rtx sixtyfour = GEN_INT (64);
        cond = gen_rtx_GE (VOIDmode, shift, sixtyfour);
        emit_insn (gen_cbranchsi4 (cond, shift, sixtyfour, large_label));
      }

    if (shiftcomparison == SMALL || shiftcomparison == RUNTIME)
      {
        /* Shift both parts by the same amount, then patch in the bits that
           cross the boundary.
           This does *not* work for zero-length shifts.  */
        rtx tmpto1 = gen_reg_rtx (DImode);
        rtx tmpto2 = gen_reg_rtx (DImode);
        emit_insn (gen_<expander>di3 (destfrom, srcfrom, shift));
        emit_insn (logical_shift_fn (tmpto1, srcto, shift));
        rtx lessershiftval = gen_reg_rtx (SImode);
        emit_insn (gen_subsi3 (lessershiftval, GEN_INT (64), shift));
        emit_insn (inverse_shift_fn (tmpto2, srcfrom, lessershiftval));
        emit_insn (gen_iordi3 (destto, tmpto1, tmpto2));
      }

    if (shiftcomparison == RUNTIME)
      {
        emit_jump_insn (gen_jump (exit_label));
        emit_barrier ();

        emit_label (zero_label);
      }

    /* A zero-length shift is just a copy.  */
    if (shiftcomparison == ZERO || shiftcomparison == RUNTIME)
      emit_move_insn (dest, src);

    if (shiftcomparison == RUNTIME)
      {
        emit_jump_insn (gen_jump (exit_label));
        emit_barrier ();

        emit_label (large_label);
      }

    if (shiftcomparison == LARGE || shiftcomparison == RUNTIME)
      {
        /* Do the shift within one part, and set the other part appropriately.
           Shifts of 128+ bits are an error.  */
        rtx lessershiftval = gen_reg_rtx (SImode);
        emit_insn (gen_subsi3 (lessershiftval, shift, GEN_INT (64)));
        emit_insn (gen_<expander>di3 (destto, srcfrom, lessershiftval));
        /* Arithmetic right shifts replicate the sign bit; the other
           shifts fill with zeros.  */
        if (shiftop == ashr)
          emit_insn (gen_ashrdi3 (destfrom, srcfrom, GEN_INT (63)));
        else
          emit_move_insn (destfrom, const0_rtx);
      }

    if (shiftcomparison == RUNTIME)
      emit_label (exit_label);

    DONE;
  })
1667
1668;; }}}
1669;; {{{ Atomics
1670
; Each compute unit has its own L1 cache. The L2 cache is shared between
1672; all the compute units.  Any load or store instruction can skip L1 and
1673; access L2 directly using the "glc" flag.  Atomic instructions also skip
1674; L1.  The L1 cache can be flushed and invalidated using instructions.
1675;
1676; Therefore, in order for "acquire" and "release" atomic modes to work
1677; correctly across compute units we must flush before each "release"
1678; and invalidate the cache after each "acquire".  It might seem like
1679; invalidation could be safely done before an "acquire", but since each
1680; compute unit can run up to 40 threads simultaneously, all reading values
1681; into the L1 cache, this is not actually safe.
1682;
1683; Additionally, scalar flat instructions access L2 via a different cache
; (the "constant cache"), so they have separate control instructions.  We
1685; do not attempt to invalidate both caches at once; instead, atomics
1686; operating on scalar flat pointers will flush the constant cache, and
1687; atomics operating on flat or global pointers will flush L1.  It is up to
1688; the programmer to get this right.
1689
; Read-modify-write operations supported by the hardware atomics;
; <X> appends the "_X2" mnemonic suffix for DImode operations.
(define_code_iterator atomicops [plus minus and ior xor])
(define_mode_attr X [(SI "") (DI "_X2")])
1692
1693;; TODO compare_and_swap test_and_set inc dec
1694;; Hardware also supports min and max, but GCC does not.
1695
; Expand a full memory barrier as a volatile unspec on a scratch BLKmode
; MEM, so the optimizers cannot move memory accesses across it.
(define_expand "memory_barrier"
  [(set (match_dup 0)
	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
  ""
  {
    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
    MEM_VOLATILE_P (operands[0]) = 1;
  })
1704
; The barrier is implemented as a volatile L1 write-back/invalidate
; (see the cache discussion at the top of this section).
(define_insn "*memory_barrier"
  [(set (match_operand:BLK 0)
	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
  ""
  "buffer_wbinvl1_vol"
  [(set_attr "type" "mubuf")
   (set_attr "length" "4")])
1712
1713; FIXME: These patterns have been disabled as they do not seem to work
1714; reliably - they can cause hangs or incorrect results.
1715; TODO: flush caches according to memory model
; Atomic fetch-and-op returning the old value, for scalar (smem), flat,
; and global address spaces.  Operand 3 is the memory model (currently
; unused).  The "0" condition keeps the pattern disabled (see FIXME
; above).
(define_insn "atomic_fetch_<bare_mnemonic><mode>"
  [(set (match_operand:SIDI 0 "register_operand"     "=Sm, v, v")
	(match_operand:SIDI 1 "memory_operand"	     "+RS,RF,RM"))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(atomicops:SIDI
	    (match_dup 1)
	    (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
	   UNSPECV_ATOMIC))
   (use (match_operand 3 "const_int_operand"))]
  "0 /* Disabled.  */"
  "@
   s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
   global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1734
1735; FIXME: These patterns are disabled because the instructions don't
1736; seem to work as advertised.  Specifically, OMP "team distribute"
1737; reductions apparently "lose" some of the writes, similar to what
1738; you might expect from a concurrent non-atomic read-modify-write.
1739; TODO: flush caches according to memory model
; Atomic op without a result value (no "glc" flag, nothing returned).
; Operand 2 is the memory model (currently unused).  The "0" condition
; keeps the pattern disabled (see FIXME above).
(define_insn "atomic_<bare_mnemonic><mode>"
  [(set (match_operand:SIDI 0 "memory_operand"       "+RS,RF,RM")
	(unspec_volatile:SIDI
	  [(atomicops:SIDI
	    (match_dup 0)
	    (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
	  UNSPECV_ATOMIC))
   (use (match_operand 2 "const_int_operand"))]
  "0 /* Disabled.  */"
  "@
   s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
   global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1756
; Helper mode attributes: <x2> is the double-width mode used to pass
; cmpswap's value/compare pair; <size> and <bitsize> are the mode's
; width in bytes and bits respectively.
(define_mode_attr x2 [(SI "DI") (DI "TI")])
(define_mode_attr size [(SI "4") (DI "8")])
(define_mode_attr bitsize [(SI "32") (DI "64")])
1760
; Compare-and-swap expander.  LDS memory uses a dedicated ds_cmpst
; insn; other address spaces use the cmpswap instruction, which takes
; the new value and the compare value as one double-width operand.
(define_expand "sync_compare_and_swap<mode>"
  [(match_operand:SIDI 0 "register_operand")
   (match_operand:SIDI 1 "memory_operand")
   (match_operand:SIDI 2 "register_operand")
   (match_operand:SIDI 3 "register_operand")]
  ""
  {
    if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
      {
	emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
							     operands[1],
							     operands[2],
							     operands[3]));
	DONE;
      }

    /* Operands 2 and 3 must be placed in consecutive registers, and passed
       as a combined value.  */
    rtx src_cmp = gen_reg_rtx (<x2>mode);
    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
    emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
						     operands[1],
						     src_cmp));
    DONE;
  })
1787
; Compare-and-swap for scalar (smem), flat, and global address spaces.
; Operand 2 is the combined value/compare pair in the double-width <x2>
; mode, built by the expander above; the old value is returned in
; operand 0 ("glc").
(define_insn "sync_compare_and_swap<mode>_insn"
  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
	(match_operand:SIDI 1 "memory_operand"      "+RS,RF,RM"))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
	  UNSPECV_ATOMIC))]
  ""
  "@
   s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
   global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")
   (set_attr "delayeduse" "*,yes,yes")])
1804
; Compare-and-swap on LDS memory, via ds_cmpst_rtn: operand 2 is the
; new value, operand 3 the compare value, and the old value is returned
; in operand 0.
(define_insn "sync_compare_and_swap<mode>_lds_insn"
  [(set (match_operand:SIDI 0 "register_operand"    "= v")
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 1 "memory_operand"   "+RL")]
	  UNSPECV_ATOMIC))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 2 "register_operand" "  v")
	   (match_operand:SIDI 3 "register_operand" "  v")]
	  UNSPECV_ATOMIC))]
  ""
  "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
  [(set_attr "type" "ds")
   (set_attr "length" "12")])
1819
;; Atomic load.  Operand 2 is a compile-time memmodel constant (only
;; "use"d, never an input to the load itself); the C fragment selects
;; the assembly according to how much cache maintenance that model
;; requires:
;;   relaxed              - plain "glc" load plus a waitcnt;
;;   consume/acquire      - as relaxed, then write back / invalidate
;;                          the volatile cache lines after the load;
;;   acq_rel/seq_cst      - write back before the load and invalidate
;;                          after it.
;; Alternative 0 is scalar (SMEM), 1 is flat, 2 is global addressing.
1820(define_insn "atomic_load<mode>"
1821  [(set (match_operand:SIDI 0 "register_operand"  "=Sm, v, v")
1822	(unspec_volatile:SIDI
1823	  [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
1824	  UNSPECV_ATOMIC))
1825   (use (match_operand:SIDI 2 "immediate_operand" "  i, i, i"))]
1826  ""
1827  {
1828    switch (INTVAL (operands[2]))
1829      {
1830      case MEMMODEL_RELAXED:
1831	switch (which_alternative)
1832	  {
1833	  case 0:
1834	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
1835	  case 1:
1836	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
1837	  case 2:
1838	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
1839	  }
1840	break;
1841      case MEMMODEL_CONSUME:
1842      case MEMMODEL_ACQUIRE:
1843      case MEMMODEL_SYNC_ACQUIRE:
1844	switch (which_alternative)
1845	  {
1846	  case 0:
1847	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
1848		   "s_dcache_wb_vol";
1849	  case 1:
1850	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
1851		   "buffer_wbinvl1_vol";
1852	  case 2:
1853	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
1854		   "buffer_wbinvl1_vol";
1855	  }
1856	break;
1857      case MEMMODEL_ACQ_REL:
1858      case MEMMODEL_SEQ_CST:
1859      case MEMMODEL_SYNC_SEQ_CST:
1860	switch (which_alternative)
1861	  {
1862	  case 0:
1863	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
1864		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
1865	  case 1:
1866	    return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
1867		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
1868	  case 2:
1869	    return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
1870		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
1871	  }
1872	break;
1873      }
1874    /* Any memmodel value not handled above is a front-end bug.  */
1875    gcc_unreachable ();
1876  }
1877  [(set_attr "type" "smem,flat,flat")
1878   (set_attr "length" "20")
1879   (set_attr "gcn_version" "gcn5,*,gcn5")])
1879
;; Atomic store.  Mirror image of atomic_load<mode>: operand 2 is the
;; memmodel constant, and the C fragment wraps the store in the cache
;; maintenance that model requires:
;;   relaxed              - plain "glc" store plus a waitcnt;
;;   release/sync_release - write back / invalidate the volatile cache
;;                          lines before the store;
;;   acq_rel/seq_cst      - maintenance both before and after.
;; Alternative 0 is scalar (SMEM), 1 is flat, 2 is global addressing.
1880(define_insn "atomic_store<mode>"
1881  [(set (match_operand:SIDI 0 "memory_operand"      "=RS,RF,RM")
1882	(unspec_volatile:SIDI
1883	  [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
1884	  UNSPECV_ATOMIC))
1885  (use (match_operand:SIDI 2 "immediate_operand"    "  i, i, i"))]
1886  ""
1887  {
1888    switch (INTVAL (operands[2]))
1889      {
1890      case MEMMODEL_RELAXED:
1891	switch (which_alternative)
1892	  {
1893	  case 0:
1894	    return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
1895	  case 1:
1896	    return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
1897	  case 2:
1898	    return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
1899	  }
1900	break;
1901      case MEMMODEL_RELEASE:
1902      case MEMMODEL_SYNC_RELEASE:
1903	switch (which_alternative)
1904	  {
1905	  case 0:
1906	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
1907	  case 1:
1908	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc";
1909	  case 2:
1910	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc";
1911	  }
1912	break;
1913      case MEMMODEL_ACQ_REL:
1914      case MEMMODEL_SEQ_CST:
1915      case MEMMODEL_SYNC_SEQ_CST:
1916	switch (which_alternative)
1917	  {
1918	  case 0:
1919	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
1920		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
1921	  case 1:
1922	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
1923		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
1924	  case 2:
1925	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
1926		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
1927	  }
1928	break;
1929      }
1930    /* Any memmodel value not handled above is a front-end bug.  */
1931    gcc_unreachable ();
1932  }
1933  [(set_attr "type" "smem,flat,flat")
1934   (set_attr "length" "20")
1935   (set_attr "gcn_version" "gcn5,*,gcn5")])
1935
;; Atomic exchange: operand 0 receives the old contents of memory
;; operand 1, which is overwritten with operand 2.  Operand 3 is the
;; memmodel constant; the four case groups add the cache write-back /
;; invalidate instructions required for relaxed, acquire, release and
;; seq_cst orderings respectively, around the atomic_swap itself.
;; Alternative 0 is scalar (SMEM), 1 is flat, 2 is global addressing.
1936(define_insn "atomic_exchange<mode>"
1937  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
1938        (match_operand:SIDI 1 "memory_operand"	    "+RS,RF,RM"))
1939   (set (match_dup 1)
1940	(unspec_volatile:SIDI
1941	  [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
1942	  UNSPECV_ATOMIC))
1943   (use (match_operand 3 "immediate_operand"))]
1944  ""
1945  {
1946    switch (INTVAL (operands[3]))
1947      {
1948      case MEMMODEL_RELAXED:
1949	switch (which_alternative)
1950	  {
1951	  case 0:
1952	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
1953	  case 1:
1954	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
1955	  case 2:
1956	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
1957		   "s_waitcnt\tvmcnt(0)";
1958	  }
1959	break;
1960      case MEMMODEL_CONSUME:
1961      case MEMMODEL_ACQUIRE:
1962      case MEMMODEL_SYNC_ACQUIRE:
1963	switch (which_alternative)
1964	  {
1965	  case 0:
1966	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
1967		   "s_dcache_wb_vol\;s_dcache_inv_vol";
1968	  case 1:
1969	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
1970		   "buffer_wbinvl1_vol";
1971	  case 2:
1972	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
1973		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
1974	  }
1975	break;
1976      case MEMMODEL_RELEASE:
1977      case MEMMODEL_SYNC_RELEASE:
1978	switch (which_alternative)
1979	  {
1980	  case 0:
1981	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
1982		   "s_waitcnt\tlgkmcnt(0)";
1983	  case 1:
1984	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
1985		   "s_waitcnt\t0";
1986	  case 2:
1987	    return "buffer_wbinvl1_vol\;"
1988		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
1989		   "s_waitcnt\tvmcnt(0)";
1990	  }
1991	break;
1992      case MEMMODEL_ACQ_REL:
1993      case MEMMODEL_SEQ_CST:
1994      case MEMMODEL_SYNC_SEQ_CST:
1995	switch (which_alternative)
1996	  {
1997	  case 0:
1998	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
1999		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
2000	  case 1:
2001	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
2002		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
2003	  case 2:
2004	    return "buffer_wbinvl1_vol\;"
2005		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
2006		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
2007	  }
2008	break;
2009      }
2010    /* Any memmodel value not handled above is a front-end bug.  */
2011    gcc_unreachable ();
2012  }
2013  [(set_attr "type" "smem,flat,flat")
2014   (set_attr "length" "20")
2015   (set_attr "gcn_version" "gcn5,*,gcn5")])
2015
2016;; }}}
2017;; {{{ OpenACC / OpenMP
2018
;; OpenACC dimension size (gang/worker/vector count).  Operand 1 is the
;; compile-time axis number; the backend helper gcn_oacc_dim_size
;; produces the value, which may be wider than SImode, so only its
;; low part is copied into operand 0.
2019(define_expand "oacc_dim_size"
2020  [(match_operand:SI 0 "register_operand")
2021   (match_operand:SI 1 "const_int_operand")]
2022  ""
2023  {
2024    rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
2025    emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
2026    DONE;
2027  })
2028
;; OpenACC dimension position (this element's index along the axis
;; given by compile-time constant operand 1); the value is computed
;; entirely by the backend helper gcn_oacc_dim_pos.
2029(define_expand "oacc_dim_pos"
2030  [(match_operand:SI 0 "register_operand")
2031   (match_operand:SI 1 "const_int_operand")]
2032  ""
2033  {
2034    emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
2035    DONE;
2036  })
2037
;; Execution barrier.  The expander manufactures a volatile BLKmode
;; scratch MEM as the pattern's operand so that the barrier also acts
;; as a full memory clobber, preventing the optimizers from moving
;; memory accesses across it.
2038(define_expand "gcn_wavefront_barrier"
2039  [(set (match_dup 0)
2040	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
2041  ""
2042  {
2043    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
2044    MEM_VOLATILE_P (operands[0]) = 1;
2045  })
2046
;; The barrier instruction itself; only reached through the
;; gcn_wavefront_barrier expander, which supplies the volatile
;; BLKmode memory operand.
2047(define_insn "*gcn_wavefront_barrier"
2048  [(set (match_operand:BLK 0 "")
2049	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
2050  ""
2051  "s_barrier"
2052  [(set_attr "type" "sopp")])
2053
;; OpenACC fork marker.  The middle end requires oacc_fork and
;; oacc_join to be defined as a pair, but on this target only the join
;; side does any work, so reaching this expander indicates a bug.
2054(define_expand "oacc_fork"
2055  [(set (match_operand:SI 0 "")
2056	(match_operand:SI 1 ""))
2057   (use (match_operand:SI 2 ""))]
2058  ""
2059  {
2060    /* We need to have oacc_fork/oacc_join named patterns as a pair,
2061       but the fork isn't actually used.  */
2062    gcc_unreachable ();
2063  })
2064
;; OpenACC join marker: all that is required on this target is a
;; wavefront barrier; operands 0-2 are accepted but unused.
2065(define_expand "oacc_join"
2066  [(set (match_operand:SI 0 "")
2067	(match_operand:SI 1 ""))
2068   (use (match_operand:SI 2 ""))]
2069  ""
2070  {
2071    emit_insn (gen_gcn_wavefront_barrier ());
2072    DONE;
2073  })
2074
2075;; }}}
2076
2077(include "gcn-valu.md")
2078